1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
20//
21//===----------------------------------------------------------------------===//
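//
// For example (an illustrative sketch with hypothetical names, not taken from
// a real profile): if leaf() contains a single allocation that the profile
// shows is cold when reached via a() but not cold when reached via b(),
// cloning can produce a second copy of leaf() so that each copy's allocation
// call carries a single unambiguous annotation:
//
//   void leaf()           { P = new char[N]; }   // annotated not cold
//   void leaf.memprof.1() { P = new char[N]; }   // annotated cold
//   void a() { leaf(); }            // stays on the not-cold copy
//   void b() { leaf.memprof.1(); }  // redirected to the cold clone
//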
22
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
37#include "llvm/IR/Module.h"
39#include "llvm/Pass.h"
43#include "llvm/Transforms/IPO.h"
47#include <deque>
48#include <sstream>
49#include <unordered_map>
50#include <vector>
51using namespace llvm;
52using namespace llvm::memprof;
53
54#define DEBUG_TYPE "memprof-context-disambiguation"
55
56STATISTIC(FunctionClonesAnalysis,
57 "Number of function clones created during whole program analysis");
58STATISTIC(FunctionClonesThinBackend,
59 "Number of function clones created during ThinLTO backend");
60STATISTIC(FunctionsClonedThinBackend,
61 "Number of functions that had clones created during ThinLTO backend");
62STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63 "cloned) during whole program analysis");
64STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65 "during whole program analysis");
66STATISTIC(AllocTypeNotColdThinBackend,
67 "Number of not cold static allocations (possibly cloned) during "
68 "ThinLTO backend");
69STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70 "(possibly cloned) during ThinLTO backend");
71STATISTIC(OrigAllocsThinBackend,
72 "Number of original (not cloned) allocations with memprof profiles "
73 "during ThinLTO backend");
74STATISTIC(
75 AllocVersionsThinBackend,
76 "Number of allocation versions (including clones) during ThinLTO backend");
77STATISTIC(MaxAllocVersionsThinBackend,
78 "Maximum number of allocation versions created for an original "
79 "allocation during ThinLTO backend");
80STATISTIC(UnclonableAllocsThinBackend,
81 "Number of unclonable ambigous allocations during ThinLTO backend");
82STATISTIC(RemovedEdgesWithMismatchedCallees,
83 "Number of edges removed due to mismatched callees (profiled vs IR)");
84STATISTIC(FoundProfiledCalleeCount,
85 "Number of profiled callees found via tail calls");
86STATISTIC(FoundProfiledCalleeDepth,
87 "Aggregate depth of profiled callees found via tail calls");
88STATISTIC(FoundProfiledCalleeMaxDepth,
89 "Maximum depth of profiled callees found via tail calls");
90STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91 "Number of profiled callees found via multiple tail call chains");
92
94 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95 cl::value_desc("filename"),
96 cl::desc("Specify the path prefix of the MemProf dot files."));
97
98static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
99 cl::Hidden,
100 cl::desc("Export graph to dot files."));
101
102static cl::opt<bool>
103 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104 cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106static cl::opt<bool>
107 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108 cl::desc("Perform verification checks on CallingContextGraph."));
109
110static cl::opt<bool>
111 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112 cl::desc("Perform frequent verification checks on nodes."));
113
115 "memprof-import-summary",
116 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117 cl::Hidden);
118
119static cl::opt<unsigned>
120 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121 cl::Hidden,
122 cl::desc("Max depth to recursively search for missing "
123 "frames through tail calls."));
124
125namespace llvm {
127 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
128 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
129
130// Indicate we are linking with an allocator that supports hot/cold operator
131// new interfaces.
133 "supports-hot-cold-new", cl::init(false), cl::Hidden,
134 cl::desc("Linking with hot/cold operator new interfaces"));
135
137 "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
138 cl::desc(
139 "Require target function definition when promoting indirect calls"));
140} // namespace llvm
141
144
145namespace {
146/// CRTP base for graphs built from either IR or ThinLTO summary index.
147///
148/// The graph represents the call contexts in all memprof metadata on allocation
149/// calls, with nodes for the allocations themselves, as well as for the calls
150/// in each context. The graph is initially built from the allocation memprof
151/// metadata (or summary) MIBs. Calls with callsite metadata are then matched
152/// onto the nodes, and the graph is updated to reflect any inlining performed
153/// on those calls.
154///
155/// Each MIB (representing an allocation's call context with allocation
156/// behavior) is assigned a unique context id during the graph build. The edges
157/// and nodes in the graph are decorated with the context ids they carry. This
158/// is used to correctly update the graph when cloning is performed so that we
159/// can uniquify the context for a single (possibly cloned) allocation.
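///
/// A minimal sketch (made-up stack and context ids): suppose the profile
/// records two contexts for the same allocation,
///   context 1: main -> A -> alloc   (cold)
///   context 2: main -> B -> alloc   (not cold)
/// The alloc node then has caller edges to the stack nodes for A and B, the A
/// edge carrying context id set {1} and the B edge carrying {2}, and A and B
/// each have a caller edge to the node for main carrying their respective
/// ids. Cloning the allocation for the cold context only requires moving the
/// ids along the A edge chain to the clone.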
160template <typename DerivedCCG, typename FuncTy, typename CallTy>
161class CallsiteContextGraph {
162public:
163 CallsiteContextGraph() = default;
164 CallsiteContextGraph(const CallsiteContextGraph &) = default;
165 CallsiteContextGraph(CallsiteContextGraph &&) = default;
166
167 /// Main entry point to perform analysis and transformations on graph.
168 bool process();
169
170 /// Perform cloning on the graph necessary to uniquely identify the allocation
171 /// behavior of an allocation based on its context.
172 void identifyClones();
173
174 /// Assign callsite clones to functions, cloning functions as needed to
175 /// accommodate the combinations of their callsite clones reached by callers.
176 /// For regular LTO this clones functions and callsites in the IR, but for
177 /// ThinLTO the cloning decisions are noted in the summaries and later applied
178 /// in applyImport.
179 bool assignFunctions();
180
181 void dump() const;
182 void print(raw_ostream &OS) const;
183 void printTotalSizes(raw_ostream &OS) const;
184
185 friend raw_ostream &operator<<(raw_ostream &OS,
186 const CallsiteContextGraph &CCG) {
187 CCG.print(OS);
188 return OS;
189 }
190
191 friend struct GraphTraits<
192 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
193 friend struct DOTGraphTraits<
194 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
195
196 void exportToDot(std::string Label) const;
197
198 /// Represents a function clone via FuncTy pointer and clone number pair.
199 struct FuncInfo final
200 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
201 using Base = std::pair<FuncTy *, unsigned>;
202 FuncInfo(const Base &B) : Base(B) {}
203 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
204 explicit operator bool() const { return this->first != nullptr; }
205 FuncTy *func() const { return this->first; }
206 unsigned cloneNo() const { return this->second; }
207 };
208
209 /// Represents a callsite clone via CallTy and clone number pair.
210 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
211 using Base = std::pair<CallTy, unsigned>;
212 CallInfo(const Base &B) : Base(B) {}
213 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
214 : Base(Call, CloneNo) {}
215 explicit operator bool() const { return (bool)this->first; }
216 CallTy call() const { return this->first; }
217 unsigned cloneNo() const { return this->second; }
218 void setCloneNo(unsigned N) { this->second = N; }
219 void print(raw_ostream &OS) const {
220 if (!operator bool()) {
221 assert(!cloneNo());
222 OS << "null Call";
223 return;
224 }
225 call()->print(OS);
226 OS << "\t(clone " << cloneNo() << ")";
227 }
228 void dump() const {
229 print(dbgs());
230 dbgs() << "\n";
231 }
232 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
233 Call.print(OS);
234 return OS;
235 }
236 };
237
238 struct ContextEdge;
239
240 /// Node in the Callsite Context Graph
241 struct ContextNode {
242 // Keep this for now since in the IR case, where we have an Instruction*,
243 // this is not as immediately discoverable from the call itself. Used for
244 // printing richer information when dumping the graph.
245 bool IsAllocation;
246
247 // Keeps track of whether the Call was reset to null because there was
248 // recursion.
249 bool Recursive = false;
250
251 // This will be formed by ORing together the AllocationType enum values
252 // for contexts including this node.
253 uint8_t AllocTypes = 0;
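 // For example, a node reached by one cold context and one not-cold context
 // ends up with AllocTypes == (Cold | NotCold), since the enum values are
 // distinct bits (see computeAllocType() below).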
254
255 // The corresponding allocation or interior call. This is the primary call
256 // for which we have created this node.
257 CallInfo Call;
258
259 // List of other calls that can be treated the same as the primary call
260 // through cloning. I.e. located in the same function and have the same
261 // (possibly pruned) stack ids. They will be updated the same way as the
262 // primary call when assigning to function clones.
263 SmallVector<CallInfo, 0> MatchingCalls;
264
265 // For alloc nodes this is a unique id assigned when constructed, and for
266 // callsite stack nodes it is the original stack id when the node is
267 // constructed from the memprof MIB metadata on the alloc nodes. Note that
268 // this is only used when matching callsite metadata onto the stack nodes
269 // created when processing the allocation memprof MIBs, and for labeling
270 // nodes in the dot graph. Therefore we don't bother to assign a value for
271 // clones.
272 uint64_t OrigStackOrAllocId = 0;
273
274 // Edges to all callees in the profiled call stacks.
275 // TODO: Should this be a map (from Callee node) for more efficient lookup?
276 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
277
278 // Edges to all callers in the profiled call stacks.
279 // TODO: Should this be a map (from Caller node) for more efficient lookup?
280 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
281
282 // Get the list of edges from which we can compute allocation information
283 // such as the context ids and allocation type of this node.
284 const std::vector<std::shared_ptr<ContextEdge>> *
285 getEdgesWithAllocInfo() const {
286 // If node has any callees, compute from those, otherwise compute from
287 // callers (i.e. if this is the leaf allocation node).
288 if (!CalleeEdges.empty())
289 return &CalleeEdges;
290 if (!CallerEdges.empty()) {
291 // A node with caller edges but no callee edges must be the allocation
292 // node.
293 assert(IsAllocation);
294 return &CallerEdges;
295 }
296 return nullptr;
297 }
298
299 // Compute the context ids for this node from the union of its edge context
300 // ids.
301 DenseSet<uint32_t> getContextIds() const {
302 DenseSet<uint32_t> ContextIds;
303 auto *Edges = getEdgesWithAllocInfo();
304 if (!Edges)
305 return {};
306 unsigned Count = 0;
307 for (auto &Edge : *Edges)
308 Count += Edge->getContextIds().size();
309 ContextIds.reserve(Count);
310 for (auto &Edge : *Edges)
311 ContextIds.insert(Edge->getContextIds().begin(),
312 Edge->getContextIds().end());
313 return ContextIds;
314 }
315
316 // Compute the allocation type for this node from the OR of its edge
317 // allocation types.
318 uint8_t computeAllocType() const {
319 auto *Edges = getEdgesWithAllocInfo();
320 if (!Edges)
321 return (uint8_t)AllocationType::None;
322 uint8_t BothTypes =
323 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
324 uint8_t AllocType = (uint8_t)AllocationType::None;
325 for (auto &Edge : *Edges) {
326 AllocType |= Edge->AllocTypes;
327 // Bail early if alloc type reached both, no further refinement.
328 if (AllocType == BothTypes)
329 return AllocType;
330 }
331 return AllocType;
332 }
333
334 // The context ids set for this node is empty if its edge context ids are
335 // also all empty.
336 bool emptyContextIds() const {
337 auto *Edges = getEdgesWithAllocInfo();
338 if (!Edges)
339 return true;
340 for (auto &Edge : *Edges) {
341 if (!Edge->getContextIds().empty())
342 return false;
343 }
344 return true;
345 }
346
347 // List of clones of this ContextNode, initially empty.
348 std::vector<ContextNode *> Clones;
349
350 // If a clone, points to the original uncloned node.
351 ContextNode *CloneOf = nullptr;
352
353 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
354
355 ContextNode(bool IsAllocation, CallInfo C)
356 : IsAllocation(IsAllocation), Call(C) {}
357
358 void addClone(ContextNode *Clone) {
359 if (CloneOf) {
360 CloneOf->Clones.push_back(Clone);
361 Clone->CloneOf = CloneOf;
362 } else {
363 Clones.push_back(Clone);
364 assert(!Clone->CloneOf);
365 Clone->CloneOf = this;
366 }
367 }
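 // Note that clones always hang off the original node in a flat list:
 // calling addClone() on a node that is itself a clone redirects to the
 // original via CloneOf, so the Clones list never nests.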
368
369 ContextNode *getOrigNode() {
370 if (!CloneOf)
371 return this;
372 return CloneOf;
373 }
374
375 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
376 unsigned int ContextId);
377
378 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
379 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
380 void eraseCalleeEdge(const ContextEdge *Edge);
381 void eraseCallerEdge(const ContextEdge *Edge);
382
383 void setCall(CallInfo C) { Call = C; }
384
385 bool hasCall() const { return (bool)Call.call(); }
386
387 void printCall(raw_ostream &OS) const { Call.print(OS); }
388
389 // True if this node was effectively removed from the graph, in which case
390 // it should have an allocation type of None and empty context ids.
391 bool isRemoved() const {
392 assert((AllocTypes == (uint8_t)AllocationType::None) ==
393 emptyContextIds());
394 return AllocTypes == (uint8_t)AllocationType::None;
395 }
396
397 void dump() const;
398 void print(raw_ostream &OS) const;
399
400 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
401 Node.print(OS);
402 return OS;
403 }
404 };
405
406 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
407 /// callee.
408 struct ContextEdge {
409 ContextNode *Callee;
410 ContextNode *Caller;
411
412 // This will be formed by ORing together the AllocationType enum values
413 // for contexts including this edge.
414 uint8_t AllocTypes = 0;
415
416 // The set of IDs for contexts including this edge.
417 DenseSet<uint32_t> ContextIds;
418
419 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
420 DenseSet<uint32_t> ContextIds)
421 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
422 ContextIds(std::move(ContextIds)) {}
423
424 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
425
426 // Helper to clear the fields of this edge when we are removing it from the
427 // graph.
428 inline void clear() {
429 ContextIds.clear();
430 AllocTypes = (uint8_t)AllocationType::None;
431 Caller = nullptr;
432 Callee = nullptr;
433 }
434
435 // Check if edge was removed from the graph. This is useful while iterating
436 // over a copy of edge lists when performing operations that mutate the
437 // graph in ways that might remove one of the edges.
438 inline bool isRemoved() const {
439 if (Callee || Caller)
440 return false;
441 // Any edges that have been removed from the graph but are still in a
442 // shared_ptr somewhere should have all fields null'ed out by clear()
443 // above.
444 assert(AllocTypes == (uint8_t)AllocationType::None);
445 assert(ContextIds.empty());
446 return true;
447 }
448
449 void dump() const;
450 void print(raw_ostream &OS) const;
451
452 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
453 Edge.print(OS);
454 return OS;
455 }
456 };
457
458 /// Helpers to remove edges that have allocation type None (due to not
459 /// carrying any context ids) after transformations.
460 void removeNoneTypeCalleeEdges(ContextNode *Node);
461 void removeNoneTypeCallerEdges(ContextNode *Node);
462 void
463 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
465
466protected:
467 /// Get a list of nodes corresponding to the stack ids in the given callsite
468 /// context.
469 template <class NodeT, class IteratorT>
470 std::vector<uint64_t>
471 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
472
473 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
474 /// metadata (or summary).
475 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
476
477 /// Adds nodes for the given MIB stack ids.
478 template <class NodeT, class IteratorT>
479 void addStackNodesForMIB(ContextNode *AllocNode,
480 CallStack<NodeT, IteratorT> &StackContext,
481 CallStack<NodeT, IteratorT> &CallsiteContext,
482 AllocationType AllocType,
483 ArrayRef<ContextTotalSize> ContextSizeInfo);
484
485 /// Matches all callsite metadata (or summary) to the nodes created for
486 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
487 /// inlining performed on those callsite instructions.
488 void updateStackNodes();
489
490 /// Update graph to conservatively handle any callsite stack nodes that target
491 /// multiple different callee functions.
492 void handleCallsitesWithMultipleTargets();
493
494 // Try to partition calls on the given node (already placed into the AllCalls
495 // array) by callee function, creating new copies of Node as needed to hold
496 // calls with different callees, and moving the callee edges appropriately.
497 // Returns true if partitioning was successful.
498 bool partitionCallsByCallee(
499 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
500 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
501
502 /// Save lists of calls with MemProf metadata in each function, for faster
503 /// iteration.
504 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
505
506 /// Map from callsite node to the enclosing caller function.
507 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
508
509private:
510 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
511
512 // Structure to keep track of information for each call as we are matching
513 // non-allocation callsites onto context nodes created from the allocation
514 // call metadata / summary contexts.
515 struct CallContextInfo {
516 // The callsite we're trying to match.
517 CallTy Call;
519 // The callsite's stack ids that have a context node in the graph.
519 std::vector<uint64_t> StackIds;
520 // The function containing this callsite.
521 const FuncTy *Func;
522 // Initially empty, if needed this will be updated to contain the context
523 // ids for use in a new context node created for this callsite.
524 DenseSet<uint32_t> ContextIds;
525 };
526
527 /// Helper to remove edge from graph, updating edge iterator if it is provided
528 /// (in which case CalleeIter indicates which edge list is being iterated).
529 /// This will also perform the necessary clearing of the ContextEdge members
530 /// to enable later checking if the edge has been removed (since we may have
531 /// other copies of the shared_ptr in existence, and in fact rely on this to
532 /// enable removal while iterating over a copy of a node's edge list).
533 void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
534 bool CalleeIter = true);
535
536 /// Assigns the given Node to calls at or inlined into the location with
537 /// the Node's stack id, after post order traversing and processing its
538 /// caller nodes. Uses the call information recorded in the given
539 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
540 /// as needed. Called by updateStackNodes which sets up the given
541 /// StackIdToMatchingCalls map.
542 void assignStackNodesPostOrder(
543 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
544 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
545 DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
546
547 /// Duplicates the given set of context ids, updating the provided
548 /// map from each original id with the newly generated context ids,
549 /// and returning the new duplicated id set.
550 DenseSet<uint32_t> duplicateContextIds(
551 const DenseSet<uint32_t> &StackSequenceContextIds,
552 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
553
554 /// Propagates all duplicated context ids across the graph.
555 void propagateDuplicateContextIds(
556 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
557
558 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
559 /// else to its callers. Also updates OrigNode's edges to remove any context
560 /// ids moved to the newly created edge.
561 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
562 bool TowardsCallee,
563 DenseSet<uint32_t> RemainingContextIds);
564
565 /// Get the stack id corresponding to the given Id or Index (for IR this will
566 /// return itself, for a summary index this will return the id recorded in the
567 /// index for that stack id index value).
568 uint64_t getStackId(uint64_t IdOrIndex) const {
569 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
570 }
571
572 /// Returns true if the given call targets the callee of the given edge, or if
573 /// we were able to identify the call chain through intermediate tail calls.
574 /// In the latter case new context nodes are added to the graph for the
575 /// identified tail calls, and their synthesized nodes are added to
576 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
577 /// the updated edges and to prepare it for an increment in the caller.
578 bool
579 calleesMatch(CallTy Call, EdgeIter &EI,
580 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
581
582 // Return the callee function of the given call, or nullptr if it can't be
583 // determined
584 const FuncTy *getCalleeFunc(CallTy Call) {
585 return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
586 }
587
588 /// Returns true if the given call targets the given function, or if we were
589 /// able to identify the call chain through intermediate tail calls (in which
590 /// case FoundCalleeChain will be populated).
591 bool calleeMatchesFunc(
592 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
593 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
594 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
595 Call, Func, CallerFunc, FoundCalleeChain);
596 }
597
598 /// Returns true if both call instructions have the same callee.
599 bool sameCallee(CallTy Call1, CallTy Call2) {
600 return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
601 }
602
603 /// Get a list of nodes corresponding to the stack ids in the given
604 /// callsite's context.
605 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
606 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
607 Call);
608 }
609
610 /// Get the last stack id in the context for callsite.
611 uint64_t getLastStackId(CallTy Call) {
612 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
613 }
614
615 /// Update the allocation call to record type of allocated memory.
616 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
617 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
618 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
619 }
620
621 /// Get the AllocationType assigned to the given allocation instruction clone.
622 AllocationType getAllocationCallType(const CallInfo &Call) const {
623 return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
624 }
625
626 /// Update non-allocation call to invoke (possibly cloned) function
627 /// CalleeFunc.
628 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
629 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
630 }
631
632 /// Clone the given function for the given callsite, recording the mapping of
633 /// all of the function's tracked calls to their new versions in the CallMap.
634 /// Assigns new clones to clone number CloneNo.
635 FuncInfo cloneFunctionForCallsite(
636 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
637 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
638 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
639 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
640 }
641
642 /// Gets a label to use in the dot graph for the given call clone in the given
643 /// function.
644 std::string getLabel(const FuncTy *Func, const CallTy Call,
645 unsigned CloneNo) const {
646 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
647 }
648
649 // Create and return a new ContextNode.
650 ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
651 CallInfo C = CallInfo()) {
652 NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
653 auto *NewNode = NodeOwner.back().get();
654 if (F)
655 NodeToCallingFunc[NewNode] = F;
656 return NewNode;
657 }
658
659 /// Helpers to find the node corresponding to the given call or stackid.
660 ContextNode *getNodeForInst(const CallInfo &C);
661 ContextNode *getNodeForAlloc(const CallInfo &C);
662 ContextNode *getNodeForStackId(uint64_t StackId);
663
664 /// Computes the alloc type corresponding to the given context ids, by
665 /// unioning their recorded alloc types.
666 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
667
668 /// Returns the allocation type of the intersection of the contexts of two
669 /// nodes (based on their provided context id sets), optimized for the case
670 /// when Node1Ids is smaller than Node2Ids.
671 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
672 const DenseSet<uint32_t> &Node2Ids);
673
674 /// Returns the allocation type of the intersection of the contexts of two
675 /// nodes (based on their provided context id sets).
676 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
677 const DenseSet<uint32_t> &Node2Ids);
678
679 /// Create a clone of Edge's callee and move Edge to that new callee node,
680 /// performing the necessary context id and allocation type updates.
681 /// If callee's caller edge iterator is supplied, it is updated when removing
682 /// the edge from that list. If ContextIdsToMove is non-empty, only that
683 /// subset of Edge's ids are moved to an edge to the new callee.
684 ContextNode *
685 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
686 EdgeIter *CallerEdgeI = nullptr,
687 DenseSet<uint32_t> ContextIdsToMove = {});
688
689 /// Change the callee of Edge to existing callee clone NewCallee, performing
690 /// the necessary context id and allocation type updates.
691 /// If callee's caller edge iterator is supplied, it is updated when removing
692 /// the edge from that list. If ContextIdsToMove is non-empty, only that
693 /// subset of Edge's ids are moved to an edge to the new callee.
694 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
695 ContextNode *NewCallee,
696 EdgeIter *CallerEdgeI = nullptr,
697 bool NewClone = false,
698 DenseSet<uint32_t> ContextIdsToMove = {});
699
700 /// Change the caller of the edge at the given callee edge iterator to be
701 /// NewCaller, performing the necessary context id and allocation type
702 /// updates. The iterator is updated as the edge is removed from the list of
703 /// callee edges in the original caller. This is similar to the above
704 /// moveEdgeToExistingCalleeClone, but a simplified version of it as we always
705 /// move the given edge and all of its context ids.
706 void moveCalleeEdgeToNewCaller(EdgeIter &CalleeEdgeI, ContextNode *NewCaller);
707
708 /// Recursively perform cloning on the graph for the given Node and its
709 /// callers, in order to uniquely identify the allocation behavior of an
710 /// allocation given its context. The context ids of the allocation being
711 /// processed are given in AllocContextIds.
712 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
713 const DenseSet<uint32_t> &AllocContextIds);
714
715 /// Map from each context ID to the AllocationType assigned to that context.
716 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
717
718 /// Map from each contextID to the profiled full contexts and their total
719 /// sizes (there may be more than one due to context trimming),
720 /// optionally populated when requested (via MemProfReportHintedSizes or
721 /// MinClonedColdBytePercent).
722 DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
723
724 /// Identifies the context node created for a stack id when adding the MIB
725 /// contexts to the graph. This is used to locate the context nodes when
726 /// trying to assign the corresponding callsites with those stack ids to these
727 /// nodes.
728 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
729
730 /// Maps to track the calls to their corresponding nodes in the graph.
731 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
732 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
733
734 /// Owner of all ContextNode unique_ptrs.
735 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
736
737 /// Perform sanity checks on graph when requested.
738 void check() const;
739
740 /// Keeps track of the last unique context id assigned.
741 unsigned int LastContextId = 0;
742};
743
744template <typename DerivedCCG, typename FuncTy, typename CallTy>
745using ContextNode =
746 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
747template <typename DerivedCCG, typename FuncTy, typename CallTy>
748using ContextEdge =
749 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
750template <typename DerivedCCG, typename FuncTy, typename CallTy>
751using FuncInfo =
752 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
753template <typename DerivedCCG, typename FuncTy, typename CallTy>
754using CallInfo =
755 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
756
757/// CRTP derived class for graphs built from IR (regular LTO).
758class ModuleCallsiteContextGraph
759 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
760 Instruction *> {
761public:
762 ModuleCallsiteContextGraph(
763 Module &M,
764 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
765
766private:
767 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
768 Instruction *>;
769
770 uint64_t getStackId(uint64_t IdOrIndex) const;
771 const Function *getCalleeFunc(Instruction *Call);
772 bool calleeMatchesFunc(
773 Instruction *Call, const Function *Func, const Function *CallerFunc,
774 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
775 bool sameCallee(Instruction *Call1, Instruction *Call2);
776 bool findProfiledCalleeThroughTailCalls(
777 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
778 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
779 bool &FoundMultipleCalleeChains);
780 uint64_t getLastStackId(Instruction *Call);
781 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
782 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
783 AllocationType getAllocationCallType(const CallInfo &Call) const;
784 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
785 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
786 Instruction *>::FuncInfo
787 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
788 std::map<CallInfo, CallInfo> &CallMap,
789 std::vector<CallInfo> &CallsWithMetadataInFunc,
790 unsigned CloneNo);
791 std::string getLabel(const Function *Func, const Instruction *Call,
792 unsigned CloneNo) const;
793
794 const Module &Mod;
795 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
796};
797
798/// Represents a call in the summary index graph, which can either be an
799/// allocation or an interior callsite node in an allocation's context.
800/// Holds a pointer to the corresponding data structure in the index.
801struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
802 IndexCall() : PointerUnion() {}
803 IndexCall(std::nullptr_t) : IndexCall() {}
804 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
805 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
806 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
807
808 IndexCall *operator->() { return this; }
809
810 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
811
812 void print(raw_ostream &OS) const {
813 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
814 OS << *AI;
815 } else {
816 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
817 assert(CI);
818 OS << *CI;
819 }
820 }
821};
822
823/// CRTP derived class for graphs built from summary index (ThinLTO).
824class IndexCallsiteContextGraph
825 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
826 IndexCall> {
827public:
828 IndexCallsiteContextGraph(
829 ModuleSummaryIndex &Index,
830 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
831 isPrevailing);
832
833 ~IndexCallsiteContextGraph() {
834 // Now that we are done with the graph it is safe to add the new
835 // CallsiteInfo structs to the function summary vectors. The graph nodes
836 // point into locations within these vectors, so we don't want to add them
837 // any earlier.
838 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
839 auto *FS = I.first;
840 for (auto &Callsite : I.second)
841 FS->addCallsite(*Callsite.second);
842 }
843 }
844
845private:
846 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
847 IndexCall>;
848
849 uint64_t getStackId(uint64_t IdOrIndex) const;
850 const FunctionSummary *getCalleeFunc(IndexCall &Call);
851 bool calleeMatchesFunc(
852 IndexCall &Call, const FunctionSummary *Func,
853 const FunctionSummary *CallerFunc,
854 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
855 bool sameCallee(IndexCall &Call1, IndexCall &Call2);
856 bool findProfiledCalleeThroughTailCalls(
857 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
858 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
859 bool &FoundMultipleCalleeChains);
860 uint64_t getLastStackId(IndexCall &Call);
861 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
862 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
863 AllocationType getAllocationCallType(const CallInfo &Call) const;
864 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
865 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
866 IndexCall>::FuncInfo
867 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
868 std::map<CallInfo, CallInfo> &CallMap,
869 std::vector<CallInfo> &CallsWithMetadataInFunc,
870 unsigned CloneNo);
871 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
872 unsigned CloneNo) const;
873
874 // Saves a mapping from each function summary containing memprof records back
875 // to its VI, for use in checking and debugging.
876 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
877
879 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
880 isPrevailing;
881
882 // Saves/owns the callsite info structures synthesized for missing tail call
883 // frames that we discover while building the graph.
884 // It maps from the summary of the function making the tail call, to a map
885 // of callee ValueInfo to corresponding synthesized callsite info.
886 std::unordered_map<FunctionSummary *,
887 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
888 FunctionCalleesToSynthesizedCallsiteInfos;
889};
890} // namespace
891
892namespace llvm {
893template <>
894struct DenseMapInfo<typename CallsiteContextGraph<
895 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
896 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
897template <>
898struct DenseMapInfo<typename CallsiteContextGraph<
899 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
900 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
901template <>
902struct DenseMapInfo<IndexCall>
903 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
904} // end namespace llvm
905
906namespace {
907
908// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
909// type we should actually use on the corresponding allocation.
910// If we can't clone a node that has NotCold+Cold alloc type, we will fall
911// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
912// from NotCold.
913AllocationType allocTypeToUse(uint8_t AllocTypes) {
914 assert(AllocTypes != (uint8_t)AllocationType::None);
915 if (AllocTypes ==
916 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
917 return AllocationType::NotCold;
918 else
919 return (AllocationType)AllocTypes;
920}
921
922// Helper to check if the alloc types for all edges recorded in the
923// InAllocTypes vector match the alloc types for all edges in the Edges
924// vector.
925template <typename DerivedCCG, typename FuncTy, typename CallTy>
926bool allocTypesMatch(
927 const std::vector<uint8_t> &InAllocTypes,
928 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
929 &Edges) {
930 // This should be called only when the InAllocTypes vector was computed for
931 // this set of Edges. Make sure the sizes are the same.
932 assert(InAllocTypes.size() == Edges.size());
933 return std::equal(
934 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
935 [](const uint8_t &l,
936 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
937 // Can share if one of the edges is None type - don't
938 // care about the type along that edge as it doesn't
939 // exist for those context ids.
940 if (l == (uint8_t)AllocationType::None ||
941 r->AllocTypes == (uint8_t)AllocationType::None)
942 return true;
943 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
944 });
945}
946
947// Helper to check if the alloc types for all edges recorded in the
948// InAllocTypes vector match the alloc types for callee edges in the given
949// clone. Because the InAllocTypes were computed from the original node's callee
950// edges, and other cloning could have happened after this clone was created, we
951// need to find the matching clone callee edge, which may or may not exist.
952template <typename DerivedCCG, typename FuncTy, typename CallTy>
953bool allocTypesMatchClone(
954 const std::vector<uint8_t> &InAllocTypes,
955 const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
956 const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
957 assert(Node);
958 // InAllocTypes should have been computed for the original node's callee
959 // edges.
960 assert(InAllocTypes.size() == Node->CalleeEdges.size());
961 // First create a map of the clone callee edge callees to the edge alloc type.
962 DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
963 EdgeCalleeMap;
964 for (const auto &E : Clone->CalleeEdges) {
965 assert(!EdgeCalleeMap.contains(E->Callee));
966 EdgeCalleeMap[E->Callee] = E->AllocTypes;
967 }
968 // Next, walk the original node's callees, and look for the corresponding
969 // clone edge to that callee.
970 for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
971 auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
972 // Not found is ok, we will simply add an edge if we use this clone.
973 if (Iter == EdgeCalleeMap.end())
974 continue;
975 // Can share if one of the edges is None type - don't
976 // care about the type along that edge as it doesn't
977 // exist for those context ids.
978 if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
979 Iter->second == (uint8_t)AllocationType::None)
980 continue;
981 if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
982 return false;
983 }
984 return true;
985}
986
987} // end anonymous namespace
988
989template <typename DerivedCCG, typename FuncTy, typename CallTy>
990typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
991CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
992 const CallInfo &C) {
993 ContextNode *Node = getNodeForAlloc(C);
994 if (Node)
995 return Node;
996
997 return NonAllocationCallToContextNodeMap.lookup(C);
998}
999
1000template <typename DerivedCCG, typename FuncTy, typename CallTy>
1001typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1002CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1003 const CallInfo &C) {
1004 return AllocationCallToContextNodeMap.lookup(C);
1005}
1006
1007template <typename DerivedCCG, typename FuncTy, typename CallTy>
1008typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1009CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1010 uint64_t StackId) {
1011 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1012 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1013 return StackEntryNode->second;
1014 return nullptr;
1015}
1016
1017template <typename DerivedCCG, typename FuncTy, typename CallTy>
1018void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1019 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1020 unsigned int ContextId) {
1021 for (auto &Edge : CallerEdges) {
1022 if (Edge->Caller == Caller) {
1023 Edge->AllocTypes |= (uint8_t)AllocType;
1024 Edge->getContextIds().insert(ContextId);
1025 return;
1026 }
1027 }
1028 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1029 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1030 CallerEdges.push_back(Edge);
1031 Caller->CalleeEdges.push_back(Edge);
1032}
1033
1034template <typename DerivedCCG, typename FuncTy, typename CallTy>
1035void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1036 ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1037 assert(!EI || (*EI)->get() == Edge);
1038 // Save the Caller and Callee pointers so we can erase Edge from their edge
1039 // lists after clearing Edge below. We do the clearing first in case it is
1040 // destructed after removing from the edge lists (if those were the last
1041 // shared_ptr references to Edge).
1042 auto *Callee = Edge->Callee;
1043 auto *Caller = Edge->Caller;
1044
1045 // Make sure the edge fields are cleared out so we can properly detect
1046 // removed edges if Edge is not destructed because there is still a shared_ptr
1047 // reference.
1048 Edge->clear();
1049
1050 if (!EI) {
1051 Callee->eraseCallerEdge(Edge);
1052 Caller->eraseCalleeEdge(Edge);
1053 } else if (CalleeIter) {
1054 Callee->eraseCallerEdge(Edge);
1055 *EI = Caller->CalleeEdges.erase(*EI);
1056 } else {
1057 Caller->eraseCalleeEdge(Edge);
1058 *EI = Callee->CallerEdges.erase(*EI);
1059 }
1060}
1061
1062template <typename DerivedCCG, typename FuncTy, typename CallTy>
1063void CallsiteContextGraph<
1064 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1065 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1066 auto Edge = *EI;
1067 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1068 assert(Edge->ContextIds.empty());
1069 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1070 } else
1071 ++EI;
1072 }
1073}
1074
1075template <typename DerivedCCG, typename FuncTy, typename CallTy>
1076void CallsiteContextGraph<
1077 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1078 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1079 auto Edge = *EI;
1080 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1081 assert(Edge->ContextIds.empty());
1082 Edge->Caller->eraseCalleeEdge(Edge.get());
1083 EI = Node->CallerEdges.erase(EI);
1084 } else
1085 ++EI;
1086 }
1087}
1088
1089template <typename DerivedCCG, typename FuncTy, typename CallTy>
1090typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1091CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1092 findEdgeFromCallee(const ContextNode *Callee) {
1093 for (const auto &Edge : CalleeEdges)
1094 if (Edge->Callee == Callee)
1095 return Edge.get();
1096 return nullptr;
1097}
1098
1099template <typename DerivedCCG, typename FuncTy, typename CallTy>
1100typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1101CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1102 findEdgeFromCaller(const ContextNode *Caller) {
1103 for (const auto &Edge : CallerEdges)
1104 if (Edge->Caller == Caller)
1105 return Edge.get();
1106 return nullptr;
1107}
1108
1109template <typename DerivedCCG, typename FuncTy, typename CallTy>
1110void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1111 eraseCalleeEdge(const ContextEdge *Edge) {
1112 auto EI = llvm::find_if(
1113 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1114 return CalleeEdge.get() == Edge;
1115 });
1116 assert(EI != CalleeEdges.end());
1117 CalleeEdges.erase(EI);
1118}
1119
1120template <typename DerivedCCG, typename FuncTy, typename CallTy>
1121void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1122 eraseCallerEdge(const ContextEdge *Edge) {
1123 auto EI = llvm::find_if(
1124 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1125 return CallerEdge.get() == Edge;
1126 });
1127 assert(EI != CallerEdges.end());
1128 CallerEdges.erase(EI);
1129}
1130
1131template <typename DerivedCCG, typename FuncTy, typename CallTy>
1132uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1133 DenseSet<uint32_t> &ContextIds) {
1134 uint8_t BothTypes =
1135 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1136 uint8_t AllocType = (uint8_t)AllocationType::None;
1137 for (auto Id : ContextIds) {
1138 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
1139 // Bail early if alloc type reached both, no further refinement.
1140 if (AllocType == BothTypes)
1141 return AllocType;
1142 }
1143 return AllocType;
1144}
1145
1146template <typename DerivedCCG, typename FuncTy, typename CallTy>
1147uint8_t
1148CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1149 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
1150 uint8_t BothTypes =
1151 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1152 uint8_t AllocType = (uint8_t)AllocationType::None;
1153 for (auto Id : Node1Ids) {
1154 if (!Node2Ids.count(Id))
1155 continue;
1156 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
1157 // Bail early if alloc type reached both, no further refinement.
1158 if (AllocType == BothTypes)
1159 return AllocType;
1160 }
1161 return AllocType;
1162}
1163
1164template <typename DerivedCCG, typename FuncTy, typename CallTy>
1165uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1166 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
1167 if (Node1Ids.size() < Node2Ids.size())
1168 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1169 else
1170 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1171}
1172
1173template <typename DerivedCCG, typename FuncTy, typename CallTy>
1174typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1175CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1176 CallInfo Call, const FuncTy *F) {
1177 assert(!getNodeForAlloc(Call));
1178 ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1179 AllocationCallToContextNodeMap[Call] = AllocNode;
1180 // Use LastContextId as a unique id for MIB allocation nodes.
1181 AllocNode->OrigStackOrAllocId = LastContextId;
1182 // Alloc type should be updated as we add in the MIBs. We should assert
1183 // afterwards that it is not still None.
1184 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1185
1186 return AllocNode;
1187}
1188
1189static std::string getAllocTypeString(uint8_t AllocTypes) {
1190 if (!AllocTypes)
1191 return "None";
1192 std::string Str;
1193 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1194 Str += "NotCold";
1195 if (AllocTypes & (uint8_t)AllocationType::Cold)
1196 Str += "Cold";
1197 return Str;
1198}
1199
1200template <typename DerivedCCG, typename FuncTy, typename CallTy>
1201template <class NodeT, class IteratorT>
1202void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1203 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1204 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1205 ArrayRef<ContextTotalSize> ContextSizeInfo) {
1206 // Treat the hot alloc type as NotCold until the disambiguation for "hot" is
1207 // done.
1208 if (AllocType == AllocationType::Hot)
1209 AllocType = AllocationType::NotCold;
1210
1211 ContextIdToAllocationType[++LastContextId] = AllocType;
1212
1213 if (!ContextSizeInfo.empty()) {
1214 auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1215 Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1216 }
1217
1218 // Update alloc type and context ids for this MIB.
1219 AllocNode->AllocTypes |= (uint8_t)AllocType;
1220
1221 // Now add or update nodes for each stack id in alloc's context.
1222 // Later when processing the stack ids on non-alloc callsites we will adjust
1223 // for any inlining in the context.
1224 ContextNode *PrevNode = AllocNode;
1225 // Look for recursion (direct recursion should have been collapsed by
1226 // module summary analysis, here we should just be detecting mutual
1227 // recursion). Mark these nodes so we don't try to clone.
1228 SmallSet<uint64_t, 8> StackIdSet;
1229 // Skip any stack ids already on the allocation call itself (due to inlining).
1230 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1231 ContextIter != StackContext.end(); ++ContextIter) {
1232 auto StackId = getStackId(*ContextIter);
1233 ContextNode *StackNode = getNodeForStackId(StackId);
1234 if (!StackNode) {
1235 StackNode = createNewNode(/*IsAllocation=*/false);
1236 StackEntryIdToContextNodeMap[StackId] = StackNode;
1237 StackNode->OrigStackOrAllocId = StackId;
1238 }
1239 auto Ins = StackIdSet.insert(StackId);
1240 if (!Ins.second)
1241 StackNode->Recursive = true;
1242 StackNode->AllocTypes |= (uint8_t)AllocType;
1243 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1244 PrevNode = StackNode;
1245 }
1246}
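// For example (made-up stack ids): an MIB whose pruned stack is [S1, S2, S3],
// ordered from the frame nearest the allocation outward, is assigned a fresh
// context id N, and the loop above links
//   AllocNode <-caller- node(S1) <-caller- node(S2) <-caller- node(S3)
// adding N to each of those caller edges and OR'ing the MIB's alloc type into
// AllocNode and each stack node.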
1247
1248template <typename DerivedCCG, typename FuncTy, typename CallTy>
1249DenseSet<uint32_t>
1250CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1251 const DenseSet<uint32_t> &StackSequenceContextIds,
1252 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1253 DenseSet<uint32_t> NewContextIds;
1254 for (auto OldId : StackSequenceContextIds) {
1255 NewContextIds.insert(++LastContextId);
1256 OldToNewContextIds[OldId].insert(LastContextId);
1257 assert(ContextIdToAllocationType.count(OldId));
1258 // The new context has the same allocation type as original.
1259 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1260 }
1261 return NewContextIds;
1262}
1263
1264template <typename DerivedCCG, typename FuncTy, typename CallTy>
1265void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1266 propagateDuplicateContextIds(
1267 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1268 // Build a set of duplicated context ids corresponding to the input id set.
1269 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1270 DenseSet<uint32_t> NewIds;
1271 for (auto Id : ContextIds)
1272 if (auto NewId = OldToNewContextIds.find(Id);
1273 NewId != OldToNewContextIds.end())
1274 NewIds.insert(NewId->second.begin(), NewId->second.end());
1275 return NewIds;
1276 };
1277
1278 // Recursively update context ids sets along caller edges.
1279 auto UpdateCallers = [&](ContextNode *Node,
1280 DenseSet<const ContextEdge *> &Visited,
1281 auto &&UpdateCallers) -> void {
1282 for (const auto &Edge : Node->CallerEdges) {
1283 auto Inserted = Visited.insert(Edge.get());
1284 if (!Inserted.second)
1285 continue;
1286 ContextNode *NextNode = Edge->Caller;
1287 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1288 // Only need to recursively iterate to NextNode via this caller edge if
1289 // it resulted in any added ids to NextNode.
1290 if (!NewIdsToAdd.empty()) {
1291 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1292 UpdateCallers(NextNode, Visited, UpdateCallers);
1293 }
1294 }
1295 };
1296
1297 DenseSet<const ContextEdge *> Visited;
1298 for (auto &Entry : AllocationCallToContextNodeMap) {
1299 auto *Node = Entry.second;
1300 UpdateCallers(Node, Visited, UpdateCallers);
1301 }
1302}
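// For example, if duplicateContextIds mapped old context id 5 to a new id 17,
// then walking up from each allocation node, every caller edge whose id set
// contains 5 also gets 17 inserted, so the duplicated context again forms a
// complete allocation-to-root path. Node-level id sets need no separate
// update since they are computed on demand from the edges.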
1303
1304template <typename DerivedCCG, typename FuncTy, typename CallTy>
1305void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1306 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1307 // This must be passed by value to make a copy since it will be adjusted
1308 // as ids are moved.
1309 DenseSet<uint32_t> RemainingContextIds) {
1310 auto &OrigEdges =
1311 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1312 // Increment iterator in loop so that we can remove edges as needed.
1313 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1314 auto Edge = *EI;
1315 // Remove any matching context ids from Edge and collect the set that were
1316 // found and removed; these become the new edge's context ids. Also update
1317 // the remaining (not found) ids.
1318 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1319 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1320 NotFoundContextIds);
1321 RemainingContextIds.swap(NotFoundContextIds);
1322 // If no matching context ids for this edge, skip it.
1323 if (NewEdgeContextIds.empty()) {
1324 ++EI;
1325 continue;
1326 }
1327 if (TowardsCallee) {
1328 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1329 auto NewEdge = std::make_shared<ContextEdge>(
1330 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1331 NewNode->CalleeEdges.push_back(NewEdge);
1332 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1333 } else {
1334 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1335 auto NewEdge = std::make_shared<ContextEdge>(
1336 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1337 NewNode->CallerEdges.push_back(NewEdge);
1338 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1339 }
1340 // Remove old edge if context ids empty.
1341 if (Edge->getContextIds().empty()) {
1342 removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1343 continue;
1344 }
1345 ++EI;
1346 }
1347}
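// For example, with TowardsCallee == true and RemainingContextIds == {3, 7},
// each of OrigNode's callee edges whose id set intersects {3, 7} spawns a new
// edge from NewNode to that same callee carrying just the intersection; those
// ids are subtracted from the original edge, and the original edge is removed
// entirely if its id set becomes empty.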
1348
1349template <typename DerivedCCG, typename FuncTy, typename CallTy>
1350static void checkEdge(
1351 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1352 // Confirm that alloc type is not None and that we have at least one context
1353 // id.
1354 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1355 assert(!Edge->ContextIds.empty());
1356}
1357
1358template <typename DerivedCCG, typename FuncTy, typename CallTy>
1359static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1360 bool CheckEdges = true) {
1361 if (Node->isRemoved())
1362 return;
1363#ifndef NDEBUG
1364 // Compute node's context ids once for use in asserts.
1365 auto NodeContextIds = Node->getContextIds();
1366#endif
1367 // Node's context ids should be the union of both its callee and caller edge
1368 // context ids.
1369 if (Node->CallerEdges.size()) {
1370 DenseSet<uint32_t> CallerEdgeContextIds(
1371 Node->CallerEdges.front()->ContextIds);
1372 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1373 if (CheckEdges)
1374 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1375 set_union(CallerEdgeContextIds, Edge->ContextIds);
1376 }
1377 // Node can have more context ids than callers if some contexts terminate at
1378 // node and some are longer.
1379 assert(NodeContextIds == CallerEdgeContextIds ||
1380 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1381 }
1382 if (Node->CalleeEdges.size()) {
1383 DenseSet<uint32_t> CalleeEdgeContextIds(
1384 Node->CalleeEdges.front()->ContextIds);
1385 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1386 if (CheckEdges)
1387 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1388 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1389 }
1390 assert(NodeContextIds == CalleeEdgeContextIds);
1391 }
1392 // FIXME: Since this checking is only invoked under an option, we should
1393 // change the error checking from using assert to something that will trigger
1394 // an error on a release build.
1395#ifndef NDEBUG
1396 // Make sure we don't end up with duplicate edges between the same caller and
1397 // callee.
1398 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1399 for (const auto &E : Node->CalleeEdges)
1400 NodeSet.insert(E->Callee);
1401 assert(NodeSet.size() == Node->CalleeEdges.size());
1402#endif
1403}
1404
1405template <typename DerivedCCG, typename FuncTy, typename CallTy>
1406void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1407 assignStackNodesPostOrder(
1408 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1409 DenseMap<uint64_t, std::vector<CallContextInfo>>
1410 &StackIdToMatchingCalls,
1411 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1412 auto Inserted = Visited.insert(Node);
1413 if (!Inserted.second)
1414 return;
1415 // Post order traversal. Iterate over a copy since we may add nodes and
1416 // therefore new callers during the recursive call, invalidating any
1417 // iterator over the original edge vector. We don't need to process these
1418 // new nodes as they were already processed on creation.
1419 auto CallerEdges = Node->CallerEdges;
1420 for (auto &Edge : CallerEdges) {
1421 // Skip any that have been removed during the recursion.
1422 if (Edge->isRemoved()) {
1423 assert(!is_contained(Node->CallerEdges, Edge));
1424 continue;
1425 }
1426 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1427 CallToMatchingCall);
1428 }
1429
1430 // If this node's stack id is in the map, update the graph to contain new
1431 // nodes representing any inlining at interior callsites. Note we move the
1432 // associated context ids over to the new nodes.
1433
1434 // Ignore this node if it is for an allocation or we didn't record any
1435 // stack id lists ending at it.
1436 if (Node->IsAllocation ||
1437 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1438 return;
1439
1440 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1441 // Handle the simple case first. A single call with a single stack id.
1442 // In this case there is no need to create any new context nodes, simply
1443 // assign the context node for stack id to this Call.
1444 if (Calls.size() == 1) {
1445 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1446 if (Ids.size() == 1) {
1447 assert(SavedContextIds.empty());
1448 // It should be this Node
1449 assert(Node == getNodeForStackId(Ids[0]));
1450 if (Node->Recursive)
1451 return;
1452 Node->setCall(Call);
1453 NonAllocationCallToContextNodeMap[Call] = Node;
1454 NodeToCallingFunc[Node] = Func;
1455 return;
1456 }
1457 }
1458
1459#ifndef NDEBUG
1460 // Find the node for the last stack id, which should be the same
1461 // across all calls recorded for this id, and is this node's id.
1462 uint64_t LastId = Node->OrigStackOrAllocId;
1463 ContextNode *LastNode = getNodeForStackId(LastId);
1464 // We should only have kept stack ids that had nodes.
1465 assert(LastNode);
1466 assert(LastNode == Node);
1467#else
1468 ContextNode *LastNode = Node;
1469#endif
1470
1471 // Compute the last node's context ids once, as it is shared by all calls in
1472 // this entry.
1473 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1474
1475 bool PrevIterCreatedNode = false;
1476 bool CreatedNode = false;
1477 for (unsigned I = 0; I < Calls.size();
1478 I++, PrevIterCreatedNode = CreatedNode) {
1479 CreatedNode = false;
1480 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1481 // Skip any for which we didn't assign any ids, these don't get a node in
1482 // the graph.
1483 if (SavedContextIds.empty()) {
1484 // If this call has a matching call (located in the same function and
1485 // having the same stack ids), simply add it to the context node created
1486 // for its matching call earlier. These can be treated the same through
1487 // cloning and get updated at the same time.
1488 if (!CallToMatchingCall.contains(Call))
1489 continue;
1490 auto MatchingCall = CallToMatchingCall[Call];
1491 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1492 // This should only happen if we had a prior iteration, and it didn't
1493 // create a node because of the below recomputation of context ids
1494 // finding none remaining and continuing early.
1495 assert(I > 0 && !PrevIterCreatedNode);
1496 continue;
1497 }
1498 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1499 Call);
1500 continue;
1501 }
1502
1503 assert(LastId == Ids.back());
1504
1505 // Recompute the context ids for this stack id sequence (the
1506 // intersection of the context ids of the corresponding nodes).
1507 // Start with the ids we saved in the map for this call, which could be
1508 // duplicated context ids. We have to recompute as we might have overlap
1509 // between the saved context ids for different last nodes, and removed them
1510 // already during the post order traversal.
1511 set_intersect(SavedContextIds, LastNodeContextIds);
1512 ContextNode *PrevNode = LastNode;
1513 bool Skip = false;
1514 // Iterate backwards through the stack Ids, starting after the last Id
1515 // in the list, which was handled once outside for all Calls.
1516 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1517 auto Id = *IdIter;
1518 ContextNode *CurNode = getNodeForStackId(Id);
1519 // We should only have kept stack ids that had nodes and weren't
1520 // recursive.
1521 assert(CurNode);
1522 assert(!CurNode->Recursive);
1523
1524 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1525 if (!Edge) {
1526 Skip = true;
1527 break;
1528 }
1529 PrevNode = CurNode;
1530
1531 // Update the context ids, which is the intersection of the ids along
1532 // all edges in the sequence.
1533 set_intersect(SavedContextIds, Edge->getContextIds());
1534
1535 // If we now have no context ids for clone, skip this call.
1536 if (SavedContextIds.empty()) {
1537 Skip = true;
1538 break;
1539 }
1540 }
1541 if (Skip)
1542 continue;
1543
1544 // Create new context node.
1545 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1546 NonAllocationCallToContextNodeMap[Call] = NewNode;
1547 CreatedNode = true;
1548 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1549
1550 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1551 assert(FirstNode);
1552
1553 // Connect to callees of innermost stack frame in inlined call chain.
1554 // This updates context ids for FirstNode's callees to reflect those
1555 // moved to NewNode.
1556 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1557
1558 // Connect to callers of outermost stack frame in inlined call chain.
1559 // This updates context ids for LastNode's callers to reflect those
1560 // moved to NewNode.
1561 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1562
1563 // Now we need to remove context ids from edges/nodes between First and
1564 // Last Node.
1565 PrevNode = nullptr;
1566 for (auto Id : Ids) {
1567 ContextNode *CurNode = getNodeForStackId(Id);
1568 // We should only have kept stack ids that had nodes.
1569 assert(CurNode);
1570
1571 // Remove the context ids moved to NewNode from CurNode, and the
1572 // edge from the prior node.
1573 if (PrevNode) {
1574 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1575 assert(PrevEdge);
1576 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1577 if (PrevEdge->getContextIds().empty())
1578 removeEdgeFromGraph(PrevEdge);
1579 }
1580 // Since we update the edges from leaf to tail, only look at the callee
1581 // edges. This isn't an alloc node, so if there are no callee edges, the
1582 // alloc type is None.
1583 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1584 ? (uint8_t)AllocationType::None
1585 : CurNode->computeAllocType();
1586 PrevNode = CurNode;
1587 }
1588 if (VerifyNodes) {
1589 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1590 for (auto Id : Ids) {
1591 ContextNode *CurNode = getNodeForStackId(Id);
1592 // We should only have kept stack ids that had nodes.
1593 assert(CurNode);
1594 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1595 }
1596 }
1597 }
1598}
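// Illustrative example (editorial sketch with hypothetical stack ids): a call
// whose callsite metadata lists stack ids [5, 6], with 5 the innermost inlined
// frame and 6 the outermost, gets a single new node that is connected to node
// 5's callees and node 6's callers, while the matching context ids are
// subtracted from the original per-stack-id nodes and the edges between them.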
1599
1600template <typename DerivedCCG, typename FuncTy, typename CallTy>
1601void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1602 // Map of stack id to all calls with that as the last (outermost caller)
1603 // callsite id that has a context node (some might not due to pruning
1604 // performed during matching of the allocation profile contexts).
1605 // The CallContextInfo contains the Call and a list of its stack ids with
1606 // ContextNodes, the function containing Call, and the set of context ids
1607 // the analysis will eventually identify for use in any new node created
1608 // for that callsite.
1609 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1610 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1611 for (auto &Call : CallsWithMetadata) {
1612 // Ignore allocations, already handled.
1613 if (AllocationCallToContextNodeMap.count(Call))
1614 continue;
1615 auto StackIdsWithContextNodes =
1616 getStackIdsWithContextNodesForCall(Call.call());
1617 // If there were no nodes created for MIBs on allocs (maybe this was in
1618 // the unambiguous part of the MIB stack that was pruned), ignore.
1619 if (StackIdsWithContextNodes.empty())
1620 continue;
1621 // Otherwise, record this Call along with the list of ids for the last
1622 // (outermost caller) stack id with a node.
1623 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1624 {Call.call(), StackIdsWithContextNodes, Func, {}});
1625 }
1626 }
1627
1628 // First make a pass through all stack ids that correspond to a call,
1629 // as identified in the above loop. Compute the context ids corresponding to
1630 // each of these calls when they correspond to multiple stack ids due to
1631 // inlining. Perform any duplication of context ids required when
1632 // there is more than one call with the same stack ids. Their (possibly newly
1633 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1634 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1635 // Save a map from each call to any that are found to match it. I.e. located
1636 // in the same function and have the same (possibly pruned) stack ids. We use
1637 // this to avoid creating extra graph nodes as they can be treated the same.
1638 DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1639 for (auto &It : StackIdToMatchingCalls) {
1640 auto &Calls = It.getSecond();
1641 // Skip single calls with a single stack id. These don't need a new node.
1642 if (Calls.size() == 1) {
1643 auto &Ids = Calls[0].StackIds;
1644 if (Ids.size() == 1)
1645 continue;
1646 }
1647 // In order to do the best and maximal matching of inlined calls to context
1648 // node sequences we will sort the vectors of stack ids in descending order
1649 // of length, and within each length, lexicographically by stack id. The
1650 // latter is so that we can specially handle calls that have identical stack
1651 // id sequences (either due to cloning or artificially because of the MIB
1652 // context pruning). Those with the same Ids are then sorted by function to
1653 // facilitate efficiently mapping them to the same context node.
1654 // Because the functions are pointers, to ensure a stable sort first assign
1655 // each function pointer to its first index in the Calls array, and then use
1656 // that to sort by.
1657 DenseMap<const FuncTy *, unsigned> FuncToIndex;
1658 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1659 FuncToIndex.insert({CallCtxInfo.Func, Idx});
1660 std::stable_sort(
1661 Calls.begin(), Calls.end(),
1662 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1663 return A.StackIds.size() > B.StackIds.size() ||
1664 (A.StackIds.size() == B.StackIds.size() &&
1665 (A.StackIds < B.StackIds ||
1666 (A.StackIds == B.StackIds &&
1667 FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1668 });
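    // Illustrative example (editorial sketch with hypothetical ids): calls
    // with stack id sequences [1,2,3], [1,2,3] (in another function), and
    // [1,2] sort so that the two [1,2,3] entries come first and are adjacent
    // (ordered by their functions' first index in Calls), followed by [1,2].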
1669
1670 // Find the node for the last stack id, which should be the same
1671 // across all calls recorded for this id, and is the id for this
1672 // entry in the StackIdToMatchingCalls map.
1673 uint64_t LastId = It.getFirst();
1674 ContextNode *LastNode = getNodeForStackId(LastId);
1675 // We should only have kept stack ids that had nodes.
1676 assert(LastNode);
1677
1678 if (LastNode->Recursive)
1679 continue;
1680
1681 // Initialize the context ids with the last node's. We will subsequently
1682 // refine the context ids by computing the intersection along all edges.
1683 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1684 assert(!LastNodeContextIds.empty());
1685
1686#ifndef NDEBUG
1687 // Save the set of functions seen for a particular set of the same stack
1688 // ids. This is used to ensure that they have been correctly sorted to be
1689 // adjacent in the Calls list, since we rely on that to efficiently place
1690 // all such matching calls onto the same context node.
1691 DenseSet<const FuncTy *> MatchingIdsFuncSet;
1692#endif
1693
1694 for (unsigned I = 0; I < Calls.size(); I++) {
1695 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1696 assert(SavedContextIds.empty());
1697 assert(LastId == Ids.back());
1698
1699#ifndef NDEBUG
1700 // If this call has a different set of ids than the last one, clear the
1701 // set used to ensure they are sorted properly.
1702 if (I > 0 && Ids != Calls[I - 1].StackIds)
1703 MatchingIdsFuncSet.clear();
1704#endif
1705
1706 // First compute the context ids for this stack id sequence (the
1707 // intersection of the context ids of the corresponding nodes).
1708 // Start with the remaining saved ids for the last node.
1709 assert(!LastNodeContextIds.empty());
1710 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1711
1712 ContextNode *PrevNode = LastNode;
1713 ContextNode *CurNode = LastNode;
1714 bool Skip = false;
1715
1716 // Iterate backwards through the stack Ids, starting after the last Id
1717 // in the list, which was handled once outside for all Calls.
1718 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1719 auto Id = *IdIter;
1720 CurNode = getNodeForStackId(Id);
1721 // We should only have kept stack ids that had nodes.
1722 assert(CurNode);
1723
1724 if (CurNode->Recursive) {
1725 Skip = true;
1726 break;
1727 }
1728
1729 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1730 // If there is no edge then the nodes belong to different MIB contexts,
1731 // and we should skip this inlined context sequence. For example, this
1732 // particular inlined context may include stack ids A->B, and we may
1733 // indeed have nodes for both A and B, but it is possible that they were
1734 // never profiled in sequence in a single MIB for any allocation (i.e.
1735 // we might have profiled an allocation that involves the callsite A,
1736 // but through a different one of its callee callsites, and we might
1737 // have profiled an allocation that involves callsite B, but reached
1738 // from a different caller callsite).
1739 if (!Edge) {
1740 Skip = true;
1741 break;
1742 }
1743 PrevNode = CurNode;
1744
1745 // Update the context ids, which is the intersection of the ids along
1746 // all edges in the sequence.
1747 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1748
1749 // If we now have no context ids for clone, skip this call.
1750 if (StackSequenceContextIds.empty()) {
1751 Skip = true;
1752 break;
1753 }
1754 }
1755 if (Skip)
1756 continue;
1757
1758 // If some of this call's stack ids did not have corresponding nodes (due
1759 // to pruning), don't include any context ids for contexts that extend
1760 // beyond these nodes. Otherwise we would be matching part of unrelated /
1761 // not fully matching stack contexts. To do this, subtract any context ids
1762 // found in caller nodes of the last node found above.
1763 if (Ids.back() != getLastStackId(Call)) {
1764 for (const auto &PE : LastNode->CallerEdges) {
1765 set_subtract(StackSequenceContextIds, PE->getContextIds());
1766 if (StackSequenceContextIds.empty())
1767 break;
1768 }
1769 // If we now have no context ids for clone, skip this call.
1770 if (StackSequenceContextIds.empty())
1771 continue;
1772 }
1773
1774#ifndef NDEBUG
1775 // If the prior call had the same stack ids this set would not be empty.
1776 // Check if we already have a call that "matches" because it is located
1777 // in the same function. If the Calls list was sorted properly we should
1778 // not encounter this situation as all such entries should be adjacent
1779 // and processed in bulk further below.
1780 assert(!MatchingIdsFuncSet.contains(Func));
1781
1782 MatchingIdsFuncSet.insert(Func);
1783#endif
1784
1785 // Check if the next set of stack ids is the same (since the Calls vector
1786 // of tuples is sorted by the stack ids we can just look at the next one).
1787 // If so, save them in the CallToMatchingCall map so that they get
1788 // assigned to the same context node, and skip them.
1789 bool DuplicateContextIds = false;
1790 for (unsigned J = I + 1; J < Calls.size(); J++) {
1791 auto &CallCtxInfo = Calls[J];
1792 auto &NextIds = CallCtxInfo.StackIds;
1793 if (NextIds != Ids)
1794 break;
1795 auto *NextFunc = CallCtxInfo.Func;
1796 if (NextFunc != Func) {
1797 // We have another Call with the same ids but that cannot share this
1798 // node, must duplicate ids for it.
1799 DuplicateContextIds = true;
1800 break;
1801 }
1802 auto &NextCall = CallCtxInfo.Call;
1803 CallToMatchingCall[NextCall] = Call;
1804 // Update I so that it gets incremented correctly to skip this call.
1805 I = J;
1806 }
1807
1808 // If we don't have duplicate context ids, then we can assign all the
1809 // context ids computed for the original node sequence to this call.
1810 // If there are duplicate calls with the same stack ids then we synthesize
1811 // new context ids that are duplicates of the originals. These are
1812 // assigned to SavedContextIds, which is a reference into the map entry
1813 // for this call, allowing us to access these ids later on.
1814 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1815 StackSequenceContextIds.size());
1816 SavedContextIds =
1817 DuplicateContextIds
1818 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1819 : StackSequenceContextIds;
1820 assert(!SavedContextIds.empty());
1821
1822 if (!DuplicateContextIds) {
1823 // Update saved last node's context ids to remove those that are
1824 // assigned to other calls, so that it is ready for the next call at
1825 // this stack id.
1826 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1827 if (LastNodeContextIds.empty())
1828 break;
1829 }
1830 }
1831 }
1832
1833 // Propagate the duplicate context ids over the graph.
1834 propagateDuplicateContextIds(OldToNewContextIds);
1835
1836 if (VerifyCCG)
1837 check();
1838
1839 // Now perform a post-order traversal over the graph, starting with the
1840 // allocation nodes, essentially processing nodes from callers to callees.
1841 // For any that contains an id in the map, update the graph to contain new
1842 // nodes representing any inlining at interior callsites. Note we move the
1843 // associated context ids over to the new nodes.
1844 DenseSet<const ContextNode *> Visited;
1845 for (auto &Entry : AllocationCallToContextNodeMap)
1846 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
1847 CallToMatchingCall);
1848 if (VerifyCCG)
1849 check();
1850}
1851
1852uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1853 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1854 Call->getMetadata(LLVMContext::MD_callsite));
1855 return CallsiteContext.back();
1856}
1857
1858uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1859 assert(isa<CallsiteInfo *>(Call.getBase()));
1860 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1861 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1862 // Need to convert index into stack id.
1863 return Index.getStackIdAtIndex(CallsiteContext.back());
1864}
1865
1866static const std::string MemProfCloneSuffix = ".memprof.";
1867
1868static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1869 // We use CloneNo == 0 to refer to the original version, which doesn't get
1870 // renamed with a suffix.
1871 if (!CloneNo)
1872 return Base.str();
1873 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1874}
1875
1876static bool isMemProfClone(const Function &F) {
1877 return F.getName().contains(MemProfCloneSuffix);
1878}
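// Illustrative examples (editorial): getMemProfFuncName("foo", 0) returns
// "foo" since the original version is not renamed, while
// getMemProfFuncName("foo", 2) returns "foo.memprof.2"; isMemProfClone() then
// recognizes such clones by the ".memprof." substring in the function name.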
1879
1880std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1881 const Instruction *Call,
1882 unsigned CloneNo) const {
1883 return (Twine(Call->getFunction()->getName()) + " -> " +
1884 cast<CallBase>(Call)->getCalledFunction()->getName())
1885 .str();
1886}
1887
1888std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1889 const IndexCall &Call,
1890 unsigned CloneNo) const {
1891 auto VI = FSToVIMap.find(Func);
1892 assert(VI != FSToVIMap.end());
1893 if (isa<AllocInfo *>(Call.getBase()))
1894 return (VI->second.name() + " -> alloc").str();
1895 else {
1896 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1897 return (VI->second.name() + " -> " +
1898 getMemProfFuncName(Callsite->Callee.name(),
1899 Callsite->Clones[CloneNo]))
1900 .str();
1901 }
1902}
1903
1904std::vector<uint64_t>
1905ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1906 Instruction *Call) {
1907 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1908 Call->getMetadata(LLVMContext::MD_callsite));
1909 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1910 CallsiteContext);
1911}
1912
1913std::vector<uint64_t>
1914IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1915 assert(isa<CallsiteInfo *>(Call.getBase()));
1916 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1917 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1918 return getStackIdsWithContextNodes<CallsiteInfo,
1919 SmallVector<unsigned>::const_iterator>(
1920 CallsiteContext);
1921}
1922
1923template <typename DerivedCCG, typename FuncTy, typename CallTy>
1924template <class NodeT, class IteratorT>
1925std::vector<uint64_t>
1926CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1927 CallStack<NodeT, IteratorT> &CallsiteContext) {
1928 std::vector<uint64_t> StackIds;
1929 for (auto IdOrIndex : CallsiteContext) {
1930 auto StackId = getStackId(IdOrIndex);
1931 ContextNode *Node = getNodeForStackId(StackId);
1932 if (!Node)
1933 break;
1934 StackIds.push_back(StackId);
1935 }
1936 return StackIds;
1937}
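// Illustrative example (editorial sketch with hypothetical stack ids): for a
// callsite context [A, B, C] listed innermost to outermost, if context nodes
// exist for A and B but not for C, this returns [A, B]; the walk stops at the
// first stack id without a node, which is why callers key their maps on the
// last (outermost) id returned.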
1938
1939ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1940 Module &M,
1941 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1942 : Mod(M), OREGetter(OREGetter) {
1943 for (auto &F : M) {
1944 std::vector<CallInfo> CallsWithMetadata;
1945 for (auto &BB : F) {
1946 for (auto &I : BB) {
1947 if (!isa<CallBase>(I))
1948 continue;
1949 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1950 CallsWithMetadata.push_back(&I);
1951 auto *AllocNode = addAllocNode(&I, &F);
1952 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1953 assert(CallsiteMD);
1954 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1955 // Add all of the MIBs and their stack nodes.
1956 for (auto &MDOp : MemProfMD->operands()) {
1957 auto *MIBMD = cast<const MDNode>(MDOp);
1958 std::vector<ContextTotalSize> ContextSizeInfo;
1959 // Collect the context size information if it exists.
1960 if (MIBMD->getNumOperands() > 2) {
1961 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
1962 MDNode *ContextSizePair =
1963 dyn_cast<MDNode>(MIBMD->getOperand(I));
1964 assert(ContextSizePair->getNumOperands() == 2);
1965 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
1966 ContextSizePair->getOperand(0))
1967 ->getZExtValue();
1968 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
1969 ContextSizePair->getOperand(1))
1970 ->getZExtValue();
1971 ContextSizeInfo.push_back({FullStackId, TotalSize});
1972 }
1973 }
1974 MDNode *StackNode = getMIBStackNode(MIBMD);
1975 assert(StackNode);
1976 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1977 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1978 AllocNode, StackContext, CallsiteContext,
1979 getMIBAllocType(MIBMD), ContextSizeInfo);
1980 }
1981 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1982 // Memprof and callsite metadata on memory allocations no longer
1983 // needed.
1984 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1985 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1986 }
1987 // For callsite metadata, add to list for this function for later use.
1988 else if (I.getMetadata(LLVMContext::MD_callsite)) {
1989 CallsWithMetadata.push_back(&I);
1990 }
1991 }
1992 }
1993 if (!CallsWithMetadata.empty())
1994 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1995 }
1996
1997 if (DumpCCG) {
1998 dbgs() << "CCG before updating call stack chains:\n";
1999 dbgs() << *this;
2000 }
2001
2002 if (ExportToDot)
2003 exportToDot("prestackupdate");
2004
2005 updateStackNodes();
2006
2007 handleCallsitesWithMultipleTargets();
2008
2009 // Strip off remaining callsite metadata, no longer needed.
2010 for (auto &FuncEntry : FuncToCallsWithMetadata)
2011 for (auto &Call : FuncEntry.second)
2012 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2013}
2014
2015IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2016 ModuleSummaryIndex &Index,
2017 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2018 isPrevailing)
2019 : Index(Index), isPrevailing(isPrevailing) {
2020 for (auto &I : Index) {
2021 auto VI = Index.getValueInfo(I);
2022 for (auto &S : VI.getSummaryList()) {
2023 // We should only add the prevailing nodes. Otherwise we may try to clone
2024 // in a weak copy that won't be linked (and may be different than the
2025 // prevailing version).
2026 // We only keep the memprof summary on the prevailing copy now when
2027 // building the combined index, as a space optimization, however don't
2028 // rely on this optimization. The linker doesn't resolve local linkage
2029 // values so don't check whether those are prevailing.
2030 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2031 !isPrevailing(VI.getGUID(), S.get()))
2032 continue;
2033 auto *FS = dyn_cast<FunctionSummary>(S.get());
2034 if (!FS)
2035 continue;
2036 std::vector<CallInfo> CallsWithMetadata;
2037 if (!FS->allocs().empty()) {
2038 for (auto &AN : FS->mutableAllocs()) {
2039 // This can happen because of recursion elimination handling that
2040 // currently exists in ModuleSummaryAnalysis. Skip these for now.
2041 // We still added them to the summary because we need to be able to
2042 // correlate properly in applyImport in the backends.
2043 if (AN.MIBs.empty())
2044 continue;
2045 IndexCall AllocCall(&AN);
2046 CallsWithMetadata.push_back(AllocCall);
2047 auto *AllocNode = addAllocNode(AllocCall, FS);
2048 // Pass an empty CallStack to the CallsiteContext (second)
2049 // parameter, since for ThinLTO we already collapsed out the inlined
2050 // stack ids on the allocation call during ModuleSummaryAnalysis.
2051 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2052 EmptyContext;
2053 unsigned I = 0;
2054 assert(
2055 AN.ContextSizeInfos.empty() ||
2056 AN.ContextSizeInfos.size() == AN.MIBs.size());
2057 // Now add all of the MIBs and their stack nodes.
2058 for (auto &MIB : AN.MIBs) {
2059 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2060 StackContext(&MIB);
2061 std::vector<ContextTotalSize> ContextSizeInfo;
2062 if (!AN.ContextSizeInfos.empty()) {
2063 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2064 ContextSizeInfo.push_back({FullStackId, TotalSize});
2065 }
2066 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2067 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2068 ContextSizeInfo);
2069 I++;
2070 }
2071 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2072 // Initialize version 0 on the summary alloc node to the current alloc
2073 // type, unless it has both types in which case make it default, so
2074 // that in the case where we aren't able to clone, the original version
2075 // always ends up with the default allocation behavior.
2076 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2077 }
2078 }
2079 // For callsite metadata, add to list for this function for later use.
2080 if (!FS->callsites().empty())
2081 for (auto &SN : FS->mutableCallsites()) {
2082 IndexCall StackNodeCall(&SN);
2083 CallsWithMetadata.push_back(StackNodeCall);
2084 }
2085
2086 if (!CallsWithMetadata.empty())
2087 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2088
2089 if (!FS->allocs().empty() || !FS->callsites().empty())
2090 FSToVIMap[FS] = VI;
2091 }
2092 }
2093
2094 if (DumpCCG) {
2095 dbgs() << "CCG before updating call stack chains:\n";
2096 dbgs() << *this;
2097 }
2098
2099 if (ExportToDot)
2100 exportToDot("prestackupdate");
2101
2102 updateStackNodes();
2103
2104 handleCallsitesWithMultipleTargets();
2105}
2106
2107template <typename DerivedCCG, typename FuncTy, typename CallTy>
2108void CallsiteContextGraph<DerivedCCG, FuncTy,
2109 CallTy>::handleCallsitesWithMultipleTargets() {
2110 // Look for and work around callsites that call multiple functions.
2111 // This can happen for indirect calls, which need better handling, and in
2112 // rarer cases (e.g. macro expansion).
2113 // TODO: To fix this for indirect calls we will want to perform speculative
2114 // devirtualization using either the normal PGO info with ICP, or using the
2115 // information in the profiled MemProf contexts. We can do this prior to
2116 // this transformation for regular LTO, and for ThinLTO we can simulate that
2117 // effect in the summary and perform the actual speculative devirtualization
2118 // while cloning in the ThinLTO backend.
2119
2120 // Keep track of the new nodes synthesized for discovered tail calls missing
2121 // from the profiled contexts.
2122 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2123
2124 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2125 for (auto &Entry : NonAllocationCallToContextNodeMap) {
2126 auto *Node = Entry.second;
2127 assert(Node->Clones.empty());
2128 // Check all node callees and see if in the same function.
2129 // We need to check all of the calls recorded in this Node, because in some
2130 // cases we may have had multiple calls with the same debug info calling
2131 // different callees. This can happen, for example, when an object is
2132 // constructed in the parameter list - the destructor call of the object has
2133 // the same debug info (line/col) as the call the object was passed to.
2134 // Here we will prune any that don't match all callee nodes.
2135 std::vector<CallInfo> AllCalls;
2136 AllCalls.reserve(Node->MatchingCalls.size() + 1);
2137 AllCalls.push_back(Node->Call);
2138 AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(),
2139 Node->MatchingCalls.end());
2140
2141 // First see if we can partition the calls by callee function, creating new
2142 // nodes to host each set of calls calling the same callees. This is
2143 // necessary to support indirect calls with ThinLTO, for which we
2144 // synthesized CallsiteInfo records for each target. They will all have the
2145 // same callsite stack ids and would be sharing a context node at this
2146 // point. We need to perform separate cloning for each, which will be
2147 // applied along with speculative devirtualization in the ThinLTO backends
2148 // as needed. Note this does not currently support looking through tail
2149 // calls, it is unclear if we need that for indirect call targets.
2150 // First partition calls by callee func. Map indexed by func, value is
2151 // struct with list of matching calls, assigned node.
2152 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2153 continue;
2154
2155 auto It = AllCalls.begin();
2156 // Iterate through the calls until we find the first that matches.
2157 for (; It != AllCalls.end(); ++It) {
2158 auto ThisCall = *It;
2159 bool Match = true;
2160 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2161 ++EI) {
2162 auto Edge = *EI;
2163 if (!Edge->Callee->hasCall())
2164 continue;
2165 assert(NodeToCallingFunc.count(Edge->Callee));
2166 // Check if the called function matches that of the callee node.
2167 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2168 Match = false;
2169 break;
2170 }
2171 }
2172 // Found a call that matches the callee nodes, we can quit now.
2173 if (Match) {
2174 // If the first match is not the primary call on the Node, update it
2175 // now. We will update the list of matching calls further below.
2176 if (Node->Call != ThisCall) {
2177 Node->setCall(ThisCall);
2178 // We need to update the NonAllocationCallToContextNodeMap, but don't
2179 // want to do this during iteration over that map, so save the calls
2180 // that need updated entries.
2181 NewCallToNode.push_back({ThisCall, Node});
2182 }
2183 break;
2184 }
2185 }
2186 // We will update this list below (or leave it cleared if there was no
2187 // match found above).
2188 Node->MatchingCalls.clear();
2189 // If we hit the end of the AllCalls vector, no call matching the callee
2190 // nodes was found, clear the call information in the node.
2191 if (It == AllCalls.end()) {
2192 RemovedEdgesWithMismatchedCallees++;
2193 // Work around by setting Node to have a null call, so it gets
2194 // skipped during cloning. Otherwise assignFunctions will assert
2195 // because its data structures are not designed to handle this case.
2196 Node->setCall(CallInfo());
2197 continue;
2198 }
2199 // Now add back any matching calls that call the same function as the
2200 // matching primary call on Node.
2201 for (++It; It != AllCalls.end(); ++It) {
2202 auto ThisCall = *It;
2203 if (!sameCallee(Node->Call.call(), ThisCall.call()))
2204 continue;
2205 Node->MatchingCalls.push_back(ThisCall);
2206 }
2207 }
2208
2209 // Remove all mismatched nodes identified in the above loop from the node map
2210 // (checking whether they have a null call which is set above). For a
2211 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2212 // to do the removal via remove_if than by individually erasing entries above.
2213 // Also remove any entries if we updated the node's primary call above.
2214 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2215 return !it.second->hasCall() || it.second->Call != it.first;
2216 });
2217
2218 // Add entries for any new primary calls recorded above.
2219 for (auto &[Call, Node] : NewCallToNode)
2220 NonAllocationCallToContextNodeMap[Call] = Node;
2221
2222 // Add the new nodes after the above loop so that the iteration is not
2223 // invalidated.
2224 for (auto &[Call, Node] : TailCallToContextNodeMap)
2225 NonAllocationCallToContextNodeMap[Call] = Node;
2226}
2227
2228template <typename DerivedCCG, typename FuncTy, typename CallTy>
2229bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2230 ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2231 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2232 // Struct to keep track of all the calls having the same callee function,
2233 // and the node we eventually assign to them. Eventually we will record the
2234 // context node assigned to this group of calls.
2235 struct CallsWithSameCallee {
2236 std::vector<CallInfo> Calls;
2237 ContextNode *Node = nullptr;
2238 };
2239
2240 // First partition calls by callee function. Build map from each function
2241 // to the list of matching calls.
2242 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2243 for (auto ThisCall : AllCalls) {
2244 auto *F = getCalleeFunc(ThisCall.call());
2245 if (F)
2246 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2247 }
2248
2249 // Next, walk through all callee edges. For each callee node, get its
2250 // containing function and see if it was recorded in the above map (meaning we
2251 // have at least one matching call). Build another map from each callee node
2252 // with a matching call to the structure instance created above containing all
2253 // the calls.
2254 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2255 for (const auto &Edge : Node->CalleeEdges) {
2256 if (!Edge->Callee->hasCall())
2257 continue;
2258 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2259 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2260 CalleeNodeToCallInfo[Edge->Callee] =
2261 &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2262 }
2263
2264 // If there are no entries in the second map, then there were no matching
2265 // calls/callees; nothing to do here. Return so we can go to the handling that
2266 // looks through tail calls.
2267 if (CalleeNodeToCallInfo.empty())
2268 return false;
2269
2270 // Walk through all callee edges again. Any and all callee edges that didn't
2271 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2272 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2273 // ignored during cloning. If it is in the map, then we use the node recorded
2274 // in that entry (creating it if needed), and move the callee edge to it.
2275 // The first callee will use the original node instead of creating a new one.
2276 // Note that any of the original calls on this node (in AllCalls) that didn't
2277 // have a callee function automatically get dropped from the node as part of
2278 // this process.
2279 ContextNode *UnmatchedCalleesNode = nullptr;
2280 // Track whether we already assigned original node to a callee.
2281 bool UsedOrigNode = false;
2282 assert(NodeToCallingFunc[Node]);
2283 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
2284 auto Edge = *EI;
2285 if (!Edge->Callee->hasCall()) {
2286 ++EI;
2287 continue;
2288 }
2289
2290 // Will be updated below to point to whatever (caller) node this callee edge
2291 // should be moved to.
2292 ContextNode *CallerNodeToUse = nullptr;
2293
2294 // Handle the case where there were no matching calls first. Move this
2295 // callee edge to the UnmatchedCalleesNode, creating it if needed.
2296 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2297 if (!UnmatchedCalleesNode)
2298 UnmatchedCalleesNode =
2299 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2300 CallerNodeToUse = UnmatchedCalleesNode;
2301 } else {
2302 // Look up the information recorded for this callee node, and use the
2303 // recorded caller node (creating it if needed).
2304 auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2305 if (!Info->Node) {
2306 // If we haven't assigned any callees to the original node use it.
2307 if (!UsedOrigNode) {
2308 Info->Node = Node;
2309 // Clear the set of matching calls which will be updated below.
2310 Node->MatchingCalls.clear();
2311 UsedOrigNode = true;
2312 } else
2313 Info->Node =
2314 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2315 assert(!Info->Calls.empty());
2316 // The first call becomes the primary call for this caller node, and the
2317 // rest go in the matching calls list.
2318 Info->Node->setCall(Info->Calls.front());
2319 Info->Node->MatchingCalls.insert(Info->Node->MatchingCalls.end(),
2320 Info->Calls.begin() + 1,
2321 Info->Calls.end());
2322 // Save the primary call to node correspondence so that we can update
2323 // the NonAllocationCallToContextNodeMap, which is being iterated in the
2324 // caller of this function.
2325 NewCallToNode.push_back({Info->Node->Call, Info->Node});
2326 }
2327 CallerNodeToUse = Info->Node;
2328 }
2329
2330 // Don't need to move edge if we are using the original node;
2331 if (CallerNodeToUse == Node) {
2332 ++EI;
2333 continue;
2334 }
2335
2336 moveCalleeEdgeToNewCaller(EI, CallerNodeToUse);
2337 }
2338 // Now that we are done moving edges, clean up any caller edges that ended
2339 // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2340 // caller edges from Node are replicated onto the new callers, and it
2341 // simplifies the handling to leave them until we have moved all
2342 // edges/context ids.
2343 for (auto &I : CalleeNodeToCallInfo)
2344 removeNoneTypeCallerEdges(I.second->Node);
2345 if (UnmatchedCalleesNode)
2346 removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2347 removeNoneTypeCallerEdges(Node);
2348
2349 return true;
2350}
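// Illustrative example (editorial sketch): if Node records calls c1 (calling
// f) and c2 (calling g), and Node has callee edges into both f and g, then
// the callee group encountered first keeps the original node with its call
// (say c1 and the edge into f), the other group's edge moves to a newly
// created node with c2 as its primary call, and any callee edges matching no
// recorded call are parked on UnmatchedCalleesNode with a null call.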
2351
2352uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2353 // In the Module (IR) case this is already the Id.
2354 return IdOrIndex;
2355}
2356
2357uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2358 // In the Index case this is an index into the stack id list in the summary
2359 // index, convert it to an Id.
2360 return Index.getStackIdAtIndex(IdOrIndex);
2361}
2362
2363template <typename DerivedCCG, typename FuncTy, typename CallTy>
2364bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2365 CallTy Call, EdgeIter &EI,
2366 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2367 auto Edge = *EI;
2368 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2369 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2370 // Will be populated in order of callee to caller if we find a chain of tail
2371 // calls between the profiled caller and callee.
2372 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2373 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2374 FoundCalleeChain))
2375 return false;
2376
2377 // The usual case where the profiled callee matches that of the IR/summary.
2378 if (FoundCalleeChain.empty())
2379 return true;
2380
2381 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2382 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2383 // If there is already an edge between these nodes, simply update it and
2384 // return.
2385 if (CurEdge) {
2386 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
2387 Edge->ContextIds.end());
2388 CurEdge->AllocTypes |= Edge->AllocTypes;
2389 return;
2390 }
2391 // Otherwise, create a new edge and insert it into the caller and callee
2392 // lists.
2393 auto NewEdge = std::make_shared<ContextEdge>(
2394 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2395 Callee->CallerEdges.push_back(NewEdge);
2396 if (Caller == Edge->Caller) {
2397 // If we are inserting the new edge into the current edge's caller, insert
2398 // the new edge before the current iterator position, and then increment
2399 // back to the current edge.
2400 EI = Caller->CalleeEdges.insert(EI, NewEdge);
2401 ++EI;
2402 assert(*EI == Edge &&
2403 "Iterator position not restored after insert and increment");
2404 } else
2405 Caller->CalleeEdges.push_back(NewEdge);
2406 };
2407
2408 // Create new nodes for each found callee and connect in between the profiled
2409 // caller and callee.
2410 auto *CurCalleeNode = Edge->Callee;
2411 for (auto &[NewCall, Func] : FoundCalleeChain) {
2412 ContextNode *NewNode = nullptr;
2413 // First check if we have already synthesized a node for this tail call.
2414 if (TailCallToContextNodeMap.count(NewCall)) {
2415 NewNode = TailCallToContextNodeMap[NewCall];
2416 NewNode->AllocTypes |= Edge->AllocTypes;
2417 } else {
2418 FuncToCallsWithMetadata[Func].push_back({NewCall});
2419 // Create Node and record node info.
2420 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2421 TailCallToContextNodeMap[NewCall] = NewNode;
2422 NewNode->AllocTypes = Edge->AllocTypes;
2423 }
2424
2425 // Hook up node to its callee node
2426 AddEdge(NewNode, CurCalleeNode);
2427
2428 CurCalleeNode = NewNode;
2429 }
2430
2431 // Hook up edge's original caller to new callee node.
2432 AddEdge(Edge->Caller, CurCalleeNode);
2433
2434#ifndef NDEBUG
2435 // Save this because Edge's fields get cleared below when removed.
2436 auto *Caller = Edge->Caller;
2437#endif
2438
2439 // Remove old edge
2440 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2441
2442 // To simplify the increment of EI in the caller, subtract one from EI.
2443 // In the final AddEdge call we would have either added a new callee edge,
2444 // to Edge->Caller, or found an existing one. Either way we are guaranteed
2445 // that there is at least one callee edge.
2446 assert(!Caller->CalleeEdges.empty());
2447 --EI;
2448
2449 return true;
2450}
2451
2452bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2453 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2454 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2455 bool &FoundMultipleCalleeChains) {
2456 // Stop recursive search if we have already explored the maximum specified
2457 // depth.
2458 if (Depth > TailCallSearchDepth)
2459 return false;
2460
2461 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2462 FoundCalleeChain.push_back({Callsite, F});
2463 };
2464
2465 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2466 if (!CalleeFunc) {
2467 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2468 assert(Alias);
2469 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2470 assert(CalleeFunc);
2471 }
2472
2473 // Look for tail calls in this function, and check if they either call the
2474 // profiled callee directly, or indirectly (via a recursive search).
2475 // Only succeed if there is a single unique tail call chain found between the
2476 // profiled caller and callee, otherwise we could perform incorrect cloning.
2477 bool FoundSingleCalleeChain = false;
2478 for (auto &BB : *CalleeFunc) {
2479 for (auto &I : BB) {
2480 auto *CB = dyn_cast<CallBase>(&I);
2481 if (!CB || !CB->isTailCall())
2482 continue;
2483 auto *CalledValue = CB->getCalledOperand();
2484 auto *CalledFunction = CB->getCalledFunction();
2485 if (CalledValue && !CalledFunction) {
2486 CalledValue = CalledValue->stripPointerCasts();
2487 // Stripping pointer casts can reveal a called function.
2488 CalledFunction = dyn_cast<Function>(CalledValue);
2489 }
2490 // Check if this is an alias to a function. If so, get the
2491 // called aliasee for the checks below.
2492 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2493 assert(!CalledFunction &&
2494 "Expected null called function in callsite for alias");
2495 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2496 }
2497 if (!CalledFunction)
2498 continue;
2499 if (CalledFunction == ProfiledCallee) {
2500 if (FoundSingleCalleeChain) {
2501 FoundMultipleCalleeChains = true;
2502 return false;
2503 }
2504 FoundSingleCalleeChain = true;
2505 FoundProfiledCalleeCount++;
2506 FoundProfiledCalleeDepth += Depth;
2507 if (Depth > FoundProfiledCalleeMaxDepth)
2508 FoundProfiledCalleeMaxDepth = Depth;
2509 SaveCallsiteInfo(&I, CalleeFunc);
2510 } else if (findProfiledCalleeThroughTailCalls(
2511 ProfiledCallee, CalledFunction, Depth + 1,
2512 FoundCalleeChain, FoundMultipleCalleeChains)) {
2513 // findProfiledCalleeThroughTailCalls should not have returned
2514 // true if FoundMultipleCalleeChains.
2515 assert(!FoundMultipleCalleeChains);
2516 if (FoundSingleCalleeChain) {
2517 FoundMultipleCalleeChains = true;
2518 return false;
2519 }
2520 FoundSingleCalleeChain = true;
2521 SaveCallsiteInfo(&I, CalleeFunc);
2522 } else if (FoundMultipleCalleeChains)
2523 return false;
2524 }
2525 }
2526
2527 return FoundSingleCalleeChain;
2528}
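// Illustrative example (editorial sketch): if a profiled context shows A
// calling C directly, but in the IR A calls B and B only tail-calls C, the
// search starting from B finds the unique chain and records the B->C callsite
// in FoundCalleeChain so the graph can later synthesize a node for it; if B
// could reach C along two different tail-call paths, FoundMultipleCalleeChains
// is set and the match is rejected.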
2529
2530const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2531 auto *CB = dyn_cast<CallBase>(Call);
2532 if (!CB->getCalledOperand() || CB->isIndirectCall())
2533 return nullptr;
2534 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2535 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2536 if (Alias)
2537 return dyn_cast<Function>(Alias->getAliasee());
2538 return dyn_cast<Function>(CalleeVal);
2539}
2540
2541bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2542 Instruction *Call, const Function *Func, const Function *CallerFunc,
2543 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2544 auto *CB = dyn_cast<CallBase>(Call);
2545 if (!CB->getCalledOperand() || CB->isIndirectCall())
2546 return false;
2547 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2548 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2549 if (CalleeFunc == Func)
2550 return true;
2551 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2552 if (Alias && Alias->getAliasee() == Func)
2553 return true;
2554
2555 // Recursively search for the profiled callee through tail calls starting with
2556 // the actual Callee. The discovered tail call chain is saved in
2557 // FoundCalleeChain, and we will fixup the graph to include these callsites
2558 // after returning.
2559 // FIXME: We will currently redo the same recursive walk if we find the same
2560 // mismatched callee from another callsite. We can improve this with more
2561 // bookkeeping of the created chain of new nodes for each mismatch.
2562 unsigned Depth = 1;
2563 bool FoundMultipleCalleeChains = false;
2564 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2565 FoundCalleeChain,
2566 FoundMultipleCalleeChains)) {
2567 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2568 << Func->getName() << " from " << CallerFunc->getName()
2569 << " that actually called " << CalleeVal->getName()
2570 << (FoundMultipleCalleeChains
2571 ? " (found multiple possible chains)"
2572 : "")
2573 << "\n");
2574 if (FoundMultipleCalleeChains)
2575 FoundProfiledCalleeNonUniquelyCount++;
2576 return false;
2577 }
2578
2579 return true;
2580}
2581
2582bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2583 Instruction *Call2) {
2584 auto *CB1 = cast<CallBase>(Call1);
2585 if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2586 return false;
2587 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2588 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2589 auto *CB2 = cast<CallBase>(Call2);
2590 if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2591 return false;
2592 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2593 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2594 return CalleeFunc1 == CalleeFunc2;
2595}
2596
2597bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2598 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2599 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2600 bool &FoundMultipleCalleeChains) {
2601 // Stop recursive search if we have already explored the maximum specified
2602 // depth.
2603 if (Depth > TailCallSearchDepth)
2604 return false;
2605
2606 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2607 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2608 // been synthesized.
2609 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2610 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2611 // StackIds is empty (we don't have debug info available in the index for
2612 // these callsites)
2613 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2614 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2615 CallsiteInfo *NewCallsiteInfo =
2616 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2617 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2618 };
2619
2620 // Look for tail calls in this function, and check if they either call the
2621 // profiled callee directly, or indirectly (via a recursive search).
2622 // Only succeed if there is a single unique tail call chain found between the
2623 // profiled caller and callee, otherwise we could perform incorrect cloning.
2624 bool FoundSingleCalleeChain = false;
2625 for (auto &S : CurCallee.getSummaryList()) {
2626 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2627 !isPrevailing(CurCallee.getGUID(), S.get()))
2628 continue;
2629 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2630 if (!FS)
2631 continue;
2632 auto FSVI = CurCallee;
2633 auto *AS = dyn_cast<AliasSummary>(S.get());
2634 if (AS)
2635 FSVI = AS->getAliaseeVI();
2636 for (auto &CallEdge : FS->calls()) {
2637 if (!CallEdge.second.hasTailCall())
2638 continue;
2639 if (CallEdge.first == ProfiledCallee) {
2640 if (FoundSingleCalleeChain) {
2641 FoundMultipleCalleeChains = true;
2642 return false;
2643 }
2644 FoundSingleCalleeChain = true;
2645 FoundProfiledCalleeCount++;
2646 FoundProfiledCalleeDepth += Depth;
2647 if (Depth > FoundProfiledCalleeMaxDepth)
2648 FoundProfiledCalleeMaxDepth = Depth;
2649 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2650 // Add FS to FSToVIMap in case it isn't already there.
2651 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2652 FSToVIMap[FS] = FSVI;
2653 } else if (findProfiledCalleeThroughTailCalls(
2654 ProfiledCallee, CallEdge.first, Depth + 1,
2655 FoundCalleeChain, FoundMultipleCalleeChains)) {
2656 // findProfiledCalleeThroughTailCalls should not have returned
2657 // true if FoundMultipleCalleeChains.
2658 assert(!FoundMultipleCalleeChains);
2659 if (FoundSingleCalleeChain) {
2660 FoundMultipleCalleeChains = true;
2661 return false;
2662 }
2663 FoundSingleCalleeChain = true;
2664 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2665 // Add FS to FSToVIMap in case it isn't already there.
2666 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2667 FSToVIMap[FS] = FSVI;
2668 } else if (FoundMultipleCalleeChains)
2669 return false;
2670 }
2671 }
2672
2673 return FoundSingleCalleeChain;
2674}
2675
2676const FunctionSummary *
2677IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2678 ValueInfo Callee =
2679 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2680 if (Callee.getSummaryList().empty())
2681 return nullptr;
2682 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2683}
2684
2685bool IndexCallsiteContextGraph::calleeMatchesFunc(
2686 IndexCall &Call, const FunctionSummary *Func,
2687 const FunctionSummary *CallerFunc,
2688 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2689 ValueInfo Callee =
2690 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2691 // If there is no summary list then this is a call to an externally defined
2692 // symbol.
2693 AliasSummary *Alias =
2694 Callee.getSummaryList().empty()
2695 ? nullptr
2696 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2697 assert(FSToVIMap.count(Func));
2698 auto FuncVI = FSToVIMap[Func];
2699 if (Callee == FuncVI ||
2700 // If callee is an alias, check the aliasee, since only function
2701 // summary base objects will contain the stack node summaries and thus
2702 // get a context node.
2703 (Alias && Alias->getAliaseeVI() == FuncVI))
2704 return true;
2705
2706 // Recursively search for the profiled callee through tail calls starting with
2707 // the actual Callee. The discovered tail call chain is saved in
2708 // FoundCalleeChain, and we will fixup the graph to include these callsites
2709 // after returning.
2710 // FIXME: We will currently redo the same recursive walk if we find the same
2711 // mismatched callee from another callsite. We can improve this with more
2712 // bookkeeping of the created chain of new nodes for each mismatch.
2713 unsigned Depth = 1;
2714 bool FoundMultipleCalleeChains = false;
2715 if (!findProfiledCalleeThroughTailCalls(
2716 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2717 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2718 << " from " << FSToVIMap[CallerFunc]
2719 << " that actually called " << Callee
2720 << (FoundMultipleCalleeChains
2721 ? " (found multiple possible chains)"
2722 : "")
2723 << "\n");
2724 if (FoundMultipleCalleeChains)
2725 FoundProfiledCalleeNonUniquelyCount++;
2726 return false;
2727 }
2728
2729 return true;
2730}
2731
2732bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2733 ValueInfo Callee1 =
2734 dyn_cast_if_present<CallsiteInfo *>(Call1.getBase())->Callee;
2735 ValueInfo Callee2 =
2736 dyn_cast_if_present<CallsiteInfo *>(Call2.getBase())->Callee;
2737 return Callee1 == Callee2;
2738}
2739
2740template <typename DerivedCCG, typename FuncTy, typename CallTy>
2741void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2742 const {
2743 print(dbgs());
2744 dbgs() << "\n";
2745}
2746
2747template <typename DerivedCCG, typename FuncTy, typename CallTy>
2748void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2749 raw_ostream &OS) const {
2750 OS << "Node " << this << "\n";
2751 OS << "\t";
2752 printCall(OS);
2753 if (Recursive)
2754 OS << " (recursive)";
2755 OS << "\n";
2756 if (!MatchingCalls.empty()) {
2757 OS << "\tMatchingCalls:\n";
2758 for (auto &MatchingCall : MatchingCalls) {
2759 OS << "\t";
2760 MatchingCall.print(OS);
2761 OS << "\n";
2762 }
2763 }
2764 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2765 OS << "\tContextIds:";
2766 // Make a copy of the computed context ids that we can sort for stability.
2767 auto ContextIds = getContextIds();
2768 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2769 std::sort(SortedIds.begin(), SortedIds.end());
2770 for (auto Id : SortedIds)
2771 OS << " " << Id;
2772 OS << "\n";
2773 OS << "\tCalleeEdges:\n";
2774 for (auto &Edge : CalleeEdges)
2775 OS << "\t\t" << *Edge << "\n";
2776 OS << "\tCallerEdges:\n";
2777 for (auto &Edge : CallerEdges)
2778 OS << "\t\t" << *Edge << "\n";
2779 if (!Clones.empty()) {
2780 OS << "\tClones: ";
2781 ListSeparator LS;
2782 for (auto *Clone : Clones)
2783 OS << LS << Clone;
2784 OS << "\n";
2785 } else if (CloneOf) {
2786 OS << "\tClone of " << CloneOf << "\n";
2787 }
2788}
2789
2790template <typename DerivedCCG, typename FuncTy, typename CallTy>
2791void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2792 const {
2793 print(dbgs());
2794 dbgs() << "\n";
2795}
2796
2797template <typename DerivedCCG, typename FuncTy, typename CallTy>
2798void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2799 raw_ostream &OS) const {
2800 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2801 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2802 OS << " ContextIds:";
2803 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2804 std::sort(SortedIds.begin(), SortedIds.end());
2805 for (auto Id : SortedIds)
2806 OS << " " << Id;
2807}
2808
2809template <typename DerivedCCG, typename FuncTy, typename CallTy>
2810void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2811 print(dbgs());
2812}
2813
2814template <typename DerivedCCG, typename FuncTy, typename CallTy>
2815void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2816 raw_ostream &OS) const {
2817 OS << "Callsite Context Graph:\n";
2818 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2819 for (const auto Node : nodes<GraphType>(this)) {
2820 if (Node->isRemoved())
2821 continue;
2822 Node->print(OS);
2823 OS << "\n";
2824 }
2825}
2826
2827template <typename DerivedCCG, typename FuncTy, typename CallTy>
2828void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2829 raw_ostream &OS) const {
2830 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2831 for (const auto Node : nodes<GraphType>(this)) {
2832 if (Node->isRemoved())
2833 continue;
2834 if (!Node->IsAllocation)
2835 continue;
2836 DenseSet<uint32_t> ContextIds = Node->getContextIds();
2837 auto AllocTypeFromCall = getAllocationCallType(Node->Call);
2838 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2839 std::sort(SortedIds.begin(), SortedIds.end());
2840 for (auto Id : SortedIds) {
2841 auto TypeI = ContextIdToAllocationType.find(Id);
2842 assert(TypeI != ContextIdToAllocationType.end());
2843 auto CSI = ContextIdToContextSizeInfos.find(Id);
2844 if (CSI != ContextIdToContextSizeInfos.end()) {
2845 for (auto &Info : CSI->second) {
2846 OS << "MemProf hinting: "
2847 << getAllocTypeString((uint8_t)TypeI->second)
2848 << " full allocation context " << Info.FullStackId
2849 << " with total size " << Info.TotalSize << " is "
2850 << getAllocTypeString(Node->AllocTypes) << " after cloning";
2851 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
2852 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
2853 << " due to cold byte percent";
2854 OS << "\n";
2855 }
2856 }
2857 }
2858 }
2859}
2860
2861template <typename DerivedCCG, typename FuncTy, typename CallTy>
2862void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2863 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2864 for (const auto Node : nodes<GraphType>(this)) {
2865 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2866 for (auto &Edge : Node->CallerEdges)
2867 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2868 }
2869}
2870
2871template <typename DerivedCCG, typename FuncTy, typename CallTy>
2872struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2873 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2874 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2875
2876 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2877 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2878
2879 using nodes_iterator =
2880 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2881 decltype(&getNode)>;
2882
2883 static nodes_iterator nodes_begin(GraphType G) {
2884 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2885 }
2886
2887 static nodes_iterator nodes_end(GraphType G) {
2888 return nodes_iterator(G->NodeOwner.end(), &getNode);
2889 }
2890
2891 static NodeRef getEntryNode(GraphType G) {
2892 return G->NodeOwner.begin()->get();
2893 }
2894
2895 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2896 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2897 GetCallee(const EdgePtrTy &P) {
2898 return P->Callee;
2899 }
2900
2901 using ChildIteratorType =
2902 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2903 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2904 decltype(&GetCallee)>;
2905
2906 static ChildIteratorType child_begin(NodeRef N) {
2907 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2908 }
2909
2910 static ChildIteratorType child_end(NodeRef N) {
2911 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2912 }
2913};
2914
2915template <typename DerivedCCG, typename FuncTy, typename CallTy>
2916struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2917 : public DefaultDOTGraphTraits {
2918 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2919
2920 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2921 using GTraits = GraphTraits<GraphType>;
2922 using NodeRef = typename GTraits::NodeRef;
2923 using ChildIteratorType = typename GTraits::ChildIteratorType;
2924
2925 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2926 std::string LabelString =
2927 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2928 Twine(Node->OrigStackOrAllocId))
2929 .str();
2930 LabelString += "\n";
2931 if (Node->hasCall()) {
2932 auto Func = G->NodeToCallingFunc.find(Node);
2933 assert(Func != G->NodeToCallingFunc.end());
2934 LabelString +=
2935 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2936 } else {
2937 LabelString += "null call";
2938 if (Node->Recursive)
2939 LabelString += " (recursive)";
2940 else
2941 LabelString += " (external)";
2942 }
2943 return LabelString;
2944 }
2945
2946 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2947 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2948 getContextIds(Node->getContextIds()) + "\"")
2949 .str();
2950 AttributeString +=
2951 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2952 AttributeString += ",style=\"filled\"";
2953 if (Node->CloneOf) {
2954 AttributeString += ",color=\"blue\"";
2955 AttributeString += ",style=\"filled,bold,dashed\"";
2956 } else
2957 AttributeString += ",style=\"filled\"";
2958 return AttributeString;
2959 }
2960
2961 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2962 GraphType) {
2963 auto &Edge = *(ChildIter.getCurrent());
2964 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2965 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2966 .str();
2967 }
2968
2969 // Since the NodeOwners list includes nodes that are no longer connected to
2970 // the graph, skip them here.
2971 static bool isNodeHidden(NodeRef Node, GraphType) {
2972 return Node->isRemoved();
2973 }
2974
2975private:
2976 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2977 std::string IdString = "ContextIds:";
2978 if (ContextIds.size() < 100) {
2979 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2980 std::sort(SortedIds.begin(), SortedIds.end());
2981 for (auto Id : SortedIds)
2982 IdString += (" " + Twine(Id)).str();
2983 } else {
2984 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2985 }
2986 return IdString;
2987 }
2988
2989 static std::string getColor(uint8_t AllocTypes) {
2990 if (AllocTypes == (uint8_t)AllocationType::NotCold)
2991 // Color "brown1" actually looks like a lighter red.
2992 return "brown1";
2993 if (AllocTypes == (uint8_t)AllocationType::Cold)
2994 return "cyan";
2995 if (AllocTypes ==
2996 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2997 // Lighter purple.
2998 return "mediumorchid1";
2999 return "gray";
3000 }
3001
3002 static std::string getNodeId(NodeRef Node) {
3003 std::stringstream SStream;
3004 SStream << std::hex << "N0x" << (unsigned long long)Node;
3005 std::string Result = SStream.str();
3006 return Result;
3007 }
3008};
3009
3010template <typename DerivedCCG, typename FuncTy, typename CallTy>
3011void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3012 std::string Label) const {
3013 WriteGraph(this, "", false, Label,
3014 DotFilePathPrefix + "ccg." + Label + ".dot");
3015}
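// Usage note (illustrative): given the filename built above, a call such as
// exportToDot("cloned") would be expected to write
// "<DotFilePathPrefix>ccg.cloned.dot", which can then be rendered with a
// tool like Graphviz dot. The label "cloned" here is only an example of how
// the pass might invoke this helper, not an exhaustive list.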
3016
3017template <typename DerivedCCG, typename FuncTy, typename CallTy>
3018typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3019CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3020 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
3021 DenseSet<uint32_t> ContextIdsToMove) {
3022 ContextNode *Node = Edge->Callee;
3023 assert(NodeToCallingFunc.count(Node));
3024 ContextNode *Clone =
3025 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3026 Node->addClone(Clone);
3027 Clone->MatchingCalls = Node->MatchingCalls;
3028 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
3029 ContextIdsToMove);
3030 return Clone;
3031}
3032
3033template <typename DerivedCCG, typename FuncTy, typename CallTy>
3034void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3035 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3036 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
3037 bool NewClone,
3038 DenseSet<uint32_t> ContextIdsToMove) {
3039 // NewCallee and Edge's current callee must be clones of the same original
3040 // node (Edge's current callee may be the original node too).
3041 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3042
3043 ContextNode *OldCallee = Edge->Callee;
3044
3045 // We might already have an edge to the new callee from earlier cloning for a
3046 // different allocation. If one exists we will reuse it.
3047 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3048
3049 // Callers will pass an empty ContextIdsToMove set when they want to move the
3050 // edge. Copy in Edge's ids for simplicity.
3051 if (ContextIdsToMove.empty())
3052 ContextIdsToMove = Edge->getContextIds();
3053
3054 // If we are moving all of Edge's ids, then just move the whole Edge.
3055 // Otherwise only move the specified subset, to a new edge if needed.
3056 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3057 // First, update the alloc types on New Callee from Edge.
3058 // Do this before we potentially clear Edge's fields below!
3059 NewCallee->AllocTypes |= Edge->AllocTypes;
3060 // Moving the whole Edge.
3061 if (ExistingEdgeToNewCallee) {
3062 // Since we already have an edge to NewCallee, simply move the ids
3063 // onto it, and remove the existing Edge.
3064 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3065 ContextIdsToMove.end());
3066 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3067 assert(Edge->ContextIds == ContextIdsToMove);
3068 removeEdgeFromGraph(Edge.get(), CallerEdgeI, /*CalleeIter=*/false);
3069 } else {
3070 // Otherwise just reconnect Edge to NewCallee.
3071 Edge->Callee = NewCallee;
3072 NewCallee->CallerEdges.push_back(Edge);
3073 // Remove it from callee where it was previously connected.
3074 if (CallerEdgeI)
3075 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
3076 else
3077 OldCallee->eraseCallerEdge(Edge.get());
3078 // Don't need to update Edge's context ids since we are simply
3079 // reconnecting it.
3080 }
3081 } else {
3082 // Only moving a subset of Edge's ids.
3083 if (CallerEdgeI)
3084 ++CallerEdgeI;
3085 // Compute the alloc type of the subset of ids being moved.
3086 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3087 if (ExistingEdgeToNewCallee) {
3088 // Since we already have an edge to NewCallee, simply move the ids
3089 // onto it.
3090 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3091 ContextIdsToMove.end());
3092 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3093 } else {
3094 // Otherwise, create a new edge to NewCallee for the ids being moved.
3095 auto NewEdge = std::make_shared<ContextEdge>(
3096 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3097 Edge->Caller->CalleeEdges.push_back(NewEdge);
3098 NewCallee->CallerEdges.push_back(NewEdge);
3099 }
3100 // In either case, need to update the alloc types on NewCallee, and remove
3101 // those ids and update the alloc type on the original Edge.
3102 NewCallee->AllocTypes |= CallerEdgeAllocType;
3103 set_subtract(Edge->ContextIds, ContextIdsToMove);
3104 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3105 }
3106 // Now walk the old callee node's callee edges and move Edge's context ids
3107 // over to the corresponding edge into the clone (which is created here if
3108 // this is a newly created clone).
3109 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3110 // The context ids moving to the new callee are the subset of this edge's
3111 // context ids and the context ids on the caller edge being moved.
3112 DenseSet<uint32_t> EdgeContextIdsToMove =
3113 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3114 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3115 OldCalleeEdge->AllocTypes =
3116 computeAllocType(OldCalleeEdge->getContextIds());
3117 if (!NewClone) {
3118 // Update context ids / alloc type on corresponding edge to NewCallee.
3119 // There is a chance this may not exist if we are reusing an existing
3120 // clone, specifically during function assignment, where we would have
3121 // removed none type edges after creating the clone. If we can't find
3122 // a corresponding edge there, fall through to the cloning below.
3123 if (auto *NewCalleeEdge =
3124 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
3125 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3126 EdgeContextIdsToMove.end());
3127 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3128 continue;
3129 }
3130 }
3131 auto NewEdge = std::make_shared<ContextEdge>(
3132 OldCalleeEdge->Callee, NewCallee,
3133 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3134 NewCallee->CalleeEdges.push_back(NewEdge);
3135 NewEdge->Callee->CallerEdges.push_back(NewEdge);
3136 }
3137 // Recompute the node alloc type now that its callee edges have been
3138 // updated (since we will compute from those edges).
3139 OldCallee->AllocTypes = OldCallee->computeAllocType();
3140 // OldCallee alloc type should be None iff its context id set is now empty.
3141 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3142 OldCallee->emptyContextIds());
3143 if (VerifyCCG) {
3144 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3145 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3146 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3147 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3148 /*CheckEdges=*/false);
3149 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3150 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3151 /*CheckEdges=*/false);
3152 }
3153}
3154
3155template <typename DerivedCCG, typename FuncTy, typename CallTy>
3156void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3157 moveCalleeEdgeToNewCaller(EdgeIter &CalleeEdgeI, ContextNode *NewCaller) {
3158 auto Edge = *CalleeEdgeI;
3159
3160 ContextNode *OldCaller = Edge->Caller;
3161
3162 // We might already have an edge to the new caller. If one exists we will
3163 // reuse it.
3164 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee);
3165
3166 CalleeEdgeI = OldCaller->CalleeEdges.erase(CalleeEdgeI);
3167 if (ExistingEdgeToNewCaller) {
3168 // Since we already have an edge to NewCaller, simply move the ids
3169 // onto it, and remove the existing Edge.
3170 ExistingEdgeToNewCaller->getContextIds().insert(
3171 Edge->getContextIds().begin(), Edge->getContextIds().end());
3172 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3173 Edge->ContextIds.clear();
3174 Edge->AllocTypes = (uint8_t)AllocationType::None;
3175 Edge->Callee->eraseCallerEdge(Edge.get());
3176 } else {
3177 // Otherwise just reconnect Edge to NewCaller.
3178 Edge->Caller = NewCaller;
3179 NewCaller->CalleeEdges.push_back(Edge);
3180 // Don't need to update Edge's context ids since we are simply
3181 // reconnecting it.
3182 }
3183 // In either case, need to update the alloc types on New Caller.
3184 NewCaller->AllocTypes |= Edge->AllocTypes;
3185
3186 // Now walk the old caller node's caller edges and move Edge's context ids
3187 // over to the corresponding edge into the node (which is created here if
3188 // this is a newly created node). We can tell whether this is a newly created
3189 // node by seeing if it has any caller edges yet.
3190#ifndef NDEBUG
3191 bool IsNewNode = NewCaller->CallerEdges.empty();
3192#endif
3193 for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3194 // The context ids moving to the new caller are the subset of this edge's
3195 // context ids and the context ids on the callee edge being moved.
3196 DenseSet<uint32_t> EdgeContextIdsToMove =
3197 set_intersection(OldCallerEdge->getContextIds(), Edge->getContextIds());
3198 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3199 OldCallerEdge->AllocTypes =
3200 computeAllocType(OldCallerEdge->getContextIds());
3201 // In this function we expect that any pre-existing node already has edges
3202 // from the same callers as the old node. That should be true in the current
3203 // use case, where we will remove None-type edges after copying over all
3204 // caller edges from the callee.
3205 auto *ExistingCallerEdge =
3206 NewCaller->findEdgeFromCaller(OldCallerEdge->Caller);
3207 assert(IsNewNode || ExistingCallerEdge);
3208 if (ExistingCallerEdge) {
3209 ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3210 EdgeContextIdsToMove.end());
3211 ExistingCallerEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3212 continue;
3213 }
3214 auto NewEdge = std::make_shared<ContextEdge>(
3215 NewCaller, OldCallerEdge->Caller,
3216 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3217 NewCaller->CallerEdges.push_back(NewEdge);
3218 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3219 }
3220 // Recompute the node alloc type now that its caller edges have been
3221 // updated (since we will compute from those edges).
3222 OldCaller->AllocTypes = OldCaller->computeAllocType();
3223 // OldCaller alloc type should be None iff its context id set is now empty.
3224 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3225 OldCaller->emptyContextIds());
3226 if (VerifyCCG) {
3227 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3228 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3229 for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3230 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3231 /*CheckEdges=*/false);
3232 for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3233 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3234 /*CheckEdges=*/false);
3235 }
3236}
3237
3238template <typename DerivedCCG, typename FuncTy, typename CallTy>
3239void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3240 recursivelyRemoveNoneTypeCalleeEdges(
3241 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3242 auto Inserted = Visited.insert(Node);
3243 if (!Inserted.second)
3244 return;
3245
3246 removeNoneTypeCalleeEdges(Node);
3247
3248 for (auto *Clone : Node->Clones)
3249 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3250
3251 // The recursive call may remove some of this Node's caller edges.
3252 // Iterate over a copy and skip any that were removed.
3253 auto CallerEdges = Node->CallerEdges;
3254 for (auto &Edge : CallerEdges) {
3255 // Skip any that have been removed by an earlier recursive call.
3256 if (Edge->isRemoved()) {
3257 assert(!is_contained(Node->CallerEdges, Edge));
3258 continue;
3259 }
3260 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3261 }
3262}
3263
3264template <typename DerivedCCG, typename FuncTy, typename CallTy>
3265void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3266 DenseSet<const ContextNode *> Visited;
3267 for (auto &Entry : AllocationCallToContextNodeMap) {
3268 Visited.clear();
3269 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3270 }
3271 Visited.clear();
3272 for (auto &Entry : AllocationCallToContextNodeMap)
3273 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3274 if (VerifyCCG)
3275 check();
3276}
3277
3278// Helper function to check whether an AllocType is cold or notcold or both.
3279bool checkColdOrNotCold(uint8_t AllocType) {
3280 return (AllocType == (uint8_t)AllocationType::Cold) ||
3281 (AllocType == (uint8_t)AllocationType::NotCold) ||
3282 (AllocType ==
3283 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3284}
3285
3286template <typename DerivedCCG, typename FuncTy, typename CallTy>
3287void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3288 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3289 const DenseSet<uint32_t> &AllocContextIds) {
3290 if (VerifyNodes)
3291 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3292 assert(!Node->CloneOf);
3293
3294 // If Node has a null call, then either it wasn't found in the module (regular
3295 // LTO) or summary index (ThinLTO), or there were other conditions blocking
3296 // cloning (e.g. recursion, calls multiple targets, etc).
3297 // Do this here so that we don't try to recursively clone callers below, which
3298 // isn't useful at least for this node.
3299 if (!Node->hasCall())
3300 return;
3301
3302#ifndef NDEBUG
3303 auto Insert =
3304#endif
3305 Visited.insert(Node);
3306 // We should not have visited this node yet.
3307 assert(Insert.second);
3308 // The recursive call to identifyClones may delete the current edge from the
3309 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3310 // in an iterator and having recursive call erase from it. Other edges may
3311 // also get removed during the recursion, which will have null Callee and
3312 // Caller pointers (and are deleted later), so we skip those below.
3313 {
3314 auto CallerEdges = Node->CallerEdges;
3315 for (auto &Edge : CallerEdges) {
3316 // Skip any that have been removed by an earlier recursive call.
3317 if (Edge->isRemoved()) {
3318 assert(!is_contained(Node->CallerEdges, Edge));
3319 continue;
3320 }
3321 // Ignore any caller we previously visited via another edge.
3322 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3323 identifyClones(Edge->Caller, Visited, AllocContextIds);
3324 }
3325 }
3326 }
3327
3328 // Check if we reached an unambiguous call or have only a single caller.
3329 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3330 return;
3331
3332 // We need to clone.
3333
3334 // Try to keep the original version as alloc type NotCold. This will make
3335 // cases with indirect calls or any other situation with an unknown call to
3336 // the original function get the default behavior. We do this by sorting the
3337 // CallerEdges of the Node we will clone by alloc type.
3338 //
3339 // Give NotCold edge the lowest sort priority so those edges are at the end of
3340 // the caller edges vector, and stay on the original version (since the below
3341 // code clones greedily until it finds all remaining edges have the same type
3342 // and leaves the remaining ones on the original Node).
3343 //
3344 // We shouldn't actually have any None type edges, so the sorting priority for
3345 // that is arbitrary, and we assert in that case below.
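 // For example (illustrative only): if Node has caller edges with alloc
 // types {NotCold, Cold, NotColdCold}, the stable_sort below orders them
 // Cold, NotColdCold, NotCold, so cloning peels off the cold-only context
 // first and the NotCold edge tends to stay on the original Node.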
3346 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3347 /*Cold*/ 1,
3348 /*NotColdCold*/ 2};
3349 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
3350 [&](const std::shared_ptr<ContextEdge> &A,
3351 const std::shared_ptr<ContextEdge> &B) {
3352 // Nodes with non-empty context ids should be sorted before
3353 // those with empty context ids.
3354 if (A->ContextIds.empty())
3355 // Either B ContextIds are non-empty (in which case we
3356 // should return false because B < A), or B ContextIds
3357 // are empty, in which case they are equal, and we should
3358 // maintain the original relative ordering.
3359 return false;
3360 if (B->ContextIds.empty())
3361 return true;
3362
3363 if (A->AllocTypes == B->AllocTypes)
3364 // Use the first context id for each edge as a
3365 // tie-breaker.
3366 return *A->ContextIds.begin() < *B->ContextIds.begin();
3367 return AllocTypeCloningPriority[A->AllocTypes] <
3368 AllocTypeCloningPriority[B->AllocTypes];
3369 });
3370
3371 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3372
3373 // Iterate until we find no more opportunities for disambiguating the alloc
3374 // types via cloning. In most cases this loop will terminate once the Node
3375 // has a single allocation type, in which case no more cloning is needed.
3376 // We need to be able to remove Edge from CallerEdges, so need to adjust
3377 // iterator inside the loop.
3378 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
3379 auto CallerEdge = *EI;
3380
3381 // See if cloning the prior caller edge left this node with a single alloc
3382 // type or a single caller. In that case no more cloning of Node is needed.
3383 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3384 break;
3385
3386 // If the caller was not successfully matched to a call in the IR/summary,
3387 // there is no point in trying to clone for it as we can't update that call.
3388 if (!CallerEdge->Caller->hasCall()) {
3389 ++EI;
3390 continue;
3391 }
3392
3393 // Only need to process the ids along this edge pertaining to the given
3394 // allocation.
3395 auto CallerEdgeContextsForAlloc =
3396 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3397 if (CallerEdgeContextsForAlloc.empty()) {
3398 ++EI;
3399 continue;
3400 }
3401 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3402
3403 // Compute the node callee edge alloc types corresponding to the context ids
3404 // for this caller edge.
3405 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3406 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3407 for (auto &CalleeEdge : Node->CalleeEdges)
3408 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3409 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3410
3411 // Don't clone if doing so will not disambiguate any alloc types amongst
3412 // caller edges (including the callee edges that would be cloned).
3413 // Otherwise we will simply move all edges to the clone.
3414 //
3415 // First check if by cloning we will disambiguate the caller allocation
3416 // type from node's allocation type. Query allocTypeToUse so that we don't
3417 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3418 // neither of these should be None type.
3419 //
3420 // Then check if by cloning node at least one of the callee edges will be
3421 // disambiguated by splitting out different context ids.
3422 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3423 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3424 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
3425 allocTypeToUse(Node->AllocTypes) &&
3426 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3427 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3428 ++EI;
3429 continue;
3430 }
3431
3432 // First see if we can use an existing clone. Check each clone and its
3433 // callee edges for matching alloc types.
3434 ContextNode *Clone = nullptr;
3435 for (auto *CurClone : Node->Clones) {
3436 if (allocTypeToUse(CurClone->AllocTypes) !=
3437 allocTypeToUse(CallerAllocTypeForAlloc))
3438 continue;
3439
3440 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3441 hasSingleAllocType(CallerAllocTypeForAlloc);
3442 // The above check should mean that if both have single alloc types that
3443 // they should be equal.
3444 assert(!BothSingleAlloc ||
3445 CurClone->AllocTypes == CallerAllocTypeForAlloc);
3446
3447 // If either both have a single alloc type (which are the same), or if the
3448 // clone's callee edges have the same alloc types as those for the current
3449 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3450 // then we can reuse this clone.
3451 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3452 CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3453 Clone = CurClone;
3454 break;
3455 }
3456 }
3457
3458 // The edge iterator is adjusted when we move the CallerEdge to the clone.
3459 if (Clone)
3460 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
3461 CallerEdgeContextsForAlloc);
3462 else
3463 Clone =
3464 moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
3465
3466 assert(EI == Node->CallerEdges.end() ||
3467 Node->AllocTypes != (uint8_t)AllocationType::None);
3468 // Sanity check that no alloc types on clone or its edges are None.
3469 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3470 }
3471
3472 // We should still have some context ids on the original Node.
3473 assert(!Node->emptyContextIds());
3474
3475 // Sanity check that no alloc types on node or edges are None.
3476 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3477
3478 if (VerifyNodes)
3479 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3480}
3481
3482void ModuleCallsiteContextGraph::updateAllocationCall(
3483 CallInfo &Call, AllocationType AllocType) {
3484 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3485 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3486 "memprof", AllocTypeString);
3487 cast<CallBase>(Call.call())->addFnAttr(A);
3488 OREGetter(Call.call()->getFunction())
3489 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3490 << ore::NV("AllocationCall", Call.call()) << " in clone "
3491 << ore::NV("Caller", Call.call()->getFunction())
3492 << " marked with memprof allocation attribute "
3493 << ore::NV("Attribute", AllocTypeString));
3494}
3495
3496void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3497 AllocationType AllocType) {
3498 auto *AI = Call.call().dyn_cast<AllocInfo *>();
3499 assert(AI);
3500 assert(AI->Versions.size() > Call.cloneNo());
3501 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3502}
3503
3504AllocationType
3505ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3506 const auto *CB = cast<CallBase>(Call.call());
3507 if (!CB->getAttributes().hasFnAttr("memprof"))
3508 return AllocationType::None;
3509 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3510 ? AllocationType::Cold
3511 : AllocationType::NotCold;
3512}
3513
3514AllocationType
3515IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3516 const auto *AI = Call.call().dyn_cast<AllocInfo *>();
3517 assert(AI->Versions.size() > Call.cloneNo());
3518 return (AllocationType)AI->Versions[Call.cloneNo()];
3519}
3520
3521void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3522 FuncInfo CalleeFunc) {
3523 if (CalleeFunc.cloneNo() > 0)
3524 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
3525 OREGetter(CallerCall.call()->getFunction())
3526 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3527 << ore::NV("Call", CallerCall.call()) << " in clone "
3528 << ore::NV("Caller", CallerCall.call()->getFunction())
3529 << " assigned to call function clone "
3530 << ore::NV("Callee", CalleeFunc.func()));
3531}
3532
3533void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3534 FuncInfo CalleeFunc) {
3535 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
3536 assert(CI &&
3537 "Caller cannot be an allocation which should not have profiled calls");
3538 assert(CI->Clones.size() > CallerCall.cloneNo());
3539 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3540}
3541
3542CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
3543 Instruction *>::FuncInfo
3544ModuleCallsiteContextGraph::cloneFunctionForCallsite(
3545 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3546 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3547 // Use existing LLVM facilities for cloning and obtaining Call in clone
3548 ValueToValueMapTy VMap;
3549 auto *NewFunc = CloneFunction(Func.func(), VMap);
3550 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
3551 assert(!Func.func()->getParent()->getFunction(Name));
3552 NewFunc->setName(Name);
3553 for (auto &Inst : CallsWithMetadataInFunc) {
3554 // This map always has the initial version in it.
3555 assert(Inst.cloneNo() == 0);
3556 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
3557 }
3558 OREGetter(Func.func())
3559 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
3560 << "created clone " << ore::NV("NewFunction", NewFunc));
3561 return {NewFunc, CloneNo};
3562}
3563
3564CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
3565 IndexCall>::FuncInfo
3566IndexCallsiteContextGraph::cloneFunctionForCallsite(
3567 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3568 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3569 // Check how many clones we have of Call (and therefore function).
3570 // The next clone number is the current size of versions array.
3571 // Confirm this matches the CloneNo provided by the caller, which is based on
3572 // the number of function clones we have.
3573 assert(CloneNo ==
3574 (Call.call().is<AllocInfo *>()
3575 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
3576 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
3577 // Walk all the instructions in this function. Create a new version for
3578 // each (by adding an entry to the Versions/Clones summary array), and copy
3579 // over the version being called for the function clone being cloned here.
3580 // Additionally, add an entry to the CallMap for the new function clone,
3581 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
3582 // to the new call clone.
3583 for (auto &Inst : CallsWithMetadataInFunc) {
3584 // This map always has the initial version in it.
3585 assert(Inst.cloneNo() == 0);
3586 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
3587 assert(AI->Versions.size() == CloneNo);
3588 // We assign the allocation type later (in updateAllocationCall), just add
3589 // an entry for it here.
3590 AI->Versions.push_back(0);
3591 } else {
3592 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
3593 assert(CI && CI->Clones.size() == CloneNo);
3594 // We assign the clone number later (in updateCall), just add an entry for
3595 // it here.
3596 CI->Clones.push_back(0);
3597 }
3598 CallMap[Inst] = {Inst.call(), CloneNo};
3599 }
3600 return {Func.func(), CloneNo};
3601}
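// Illustrative example (hypothetical summary state): for the index-based
// graph, "cloning" a function copies no IR. If an allocation's summary
// currently has Versions = {Cold} and we create function clone 1, the loop
// above appends a placeholder so Versions becomes {Cold, 0}; that new slot
// is filled in later by updateAllocationCall. Callsite records grow their
// Clones array the same way and are filled in later by updateCall.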
3602
3603// This method assigns cloned callsites to functions, cloning the functions as
3604// needed. The assignment is greedy and proceeds roughly as follows:
3605//
3606// For each function Func:
3607// For each call with graph Node having clones:
3608// Initialize ClonesWorklist to Node and its clones
3609// Initialize NodeCloneCount to 0
3610// While ClonesWorklist is not empty:
3611// Clone = pop front ClonesWorklist
3612// NodeCloneCount++
3613// If Func has been cloned less than NodeCloneCount times:
3614// If NodeCloneCount is 1:
3615// Assign Clone to original Func
3616// Continue
3617// Create a new function clone
3618// If other callers not assigned to call a function clone yet:
3619// Assign them to call new function clone
3620// Continue
3621// Assign any other caller calling the cloned version to new clone
3622//
3623// For each caller of Clone:
3624// If caller is assigned to call a specific function clone:
3625// If we cannot assign Clone to that function clone:
3626// Create new callsite Clone NewClone
3627// Add NewClone to ClonesWorklist
3628// Continue
3629// Assign Clone to existing caller's called function clone
3630// Else:
3631// If Clone not already assigned to a function clone:
3632// Assign to first function clone without assignment
3633// Assign caller to selected function clone
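// A small worked example (hypothetical function name): suppose function Foo
// contains a callsite whose graph node ended up with one clone (two versions
// total). The first version is assigned to the original Foo. The second
// requires a new function clone, created via cloneFunctionForCallsite (named
// with getMemProfFuncName using clone number 1), and each caller node that
// CallsiteToCalleeFuncCloneMap records as reaching that second version is
// updated to call the new function clone instead.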
3634template <typename DerivedCCG, typename FuncTy, typename CallTy>
3635bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
3636 bool Changed = false;
3637
3638 // Keep track of the assignment of nodes (callsites) to function clones they
3639 // call.
3640 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
3641
3642 // Update caller node to call function version CalleeFunc, by recording the
3643 // assignment in CallsiteToCalleeFuncCloneMap.
3644 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
3645 const FuncInfo &CalleeFunc) {
3646 assert(Caller->hasCall());
3647 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
3648 };
3649
3650 // Walk all functions for which we saw calls with memprof metadata, and handle
3651 // cloning for each of its calls.
3652 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
3653 FuncInfo OrigFunc(Func);
3654 // Map from each clone of OrigFunc to a map of remappings of each call of
3655 // interest (from original uncloned call to the corresponding cloned call in
3656 // that function clone).
3657 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
3658 for (auto &Call : CallsWithMetadata) {
3659 ContextNode *Node = getNodeForInst(Call);
3660 // Skip call if we do not have a node for it (all uses of its stack ids
3661 // were either on inlined chains or pruned from the MIBs), or if we did
3662 // not create any clones for it.
3663 if (!Node || Node->Clones.empty())
3664 continue;
3665 assert(Node->hasCall() &&
3666 "Not having a call should have prevented cloning");
3667
3668 // Track the assignment of function clones to clones of the current
3669 // callsite Node being handled.
3670 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3671
3672 // Assign callsite version CallsiteClone to function version FuncClone,
3673 // and also assign (possibly cloned) Call to CallsiteClone.
3674 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3675 CallInfo &Call,
3676 ContextNode *CallsiteClone,
3677 bool IsAlloc) {
3678 // Record the clone of callsite node assigned to this function clone.
3679 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3680
3681 assert(FuncClonesToCallMap.count(FuncClone));
3682 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3683 CallInfo CallClone(Call);
3684 if (CallMap.count(Call))
3685 CallClone = CallMap[Call];
3686 CallsiteClone->setCall(CallClone);
3687 // Need to do the same for all matching calls.
3688 for (auto &MatchingCall : Node->MatchingCalls) {
3689 CallInfo CallClone(MatchingCall);
3690 if (CallMap.count(MatchingCall))
3691 CallClone = CallMap[MatchingCall];
3692 // Updates the call in the list.
3693 MatchingCall = CallClone;
3694 }
3695 };
3696
3697 // Keep track of the clones of callsite Node that need to be assigned to
3698 // function clones. This list may be expanded in the loop body below if we
3699 // find additional cloning is required.
3700 std::deque<ContextNode *> ClonesWorklist;
3701 // Ignore original Node if we moved all of its contexts to clones.
3702 if (!Node->emptyContextIds())
3703 ClonesWorklist.push_back(Node);
3704 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3705 Node->Clones.end());
3706
3707 // Now walk through all of the clones of this callsite Node that we need,
3708 // and determine the assignment to a corresponding clone of the current
3709 // function (creating new function clones as needed).
3710 unsigned NodeCloneCount = 0;
3711 while (!ClonesWorklist.empty()) {
3712 ContextNode *Clone = ClonesWorklist.front();
3713 ClonesWorklist.pop_front();
3714 NodeCloneCount++;
3715 if (VerifyNodes)
3716 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3717
3718 // Need to create a new function clone if we have more callsite clones
3719 // than existing function clones, which would have been assigned to an
3720 // earlier clone in the list (we assign callsite clones to function
3721 // clones greedily).
3722 if (FuncClonesToCallMap.size() < NodeCloneCount) {
3723 // If this is the first callsite copy, assign to original function.
3724 if (NodeCloneCount == 1) {
3725 // Since FuncClonesToCallMap is empty in this case, no clones have
3726 // been created for this function yet, and no callers should have
3727 // been assigned a function clone for this callee node yet.
3728 assert(llvm::none_of(
3729 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3730 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3731 }));
3732 // Initialize with empty call map, assign Clone to original function
3733 // and its callers, and skip to the next clone.
3734 FuncClonesToCallMap[OrigFunc] = {};
3735 AssignCallsiteCloneToFuncClone(
3736 OrigFunc, Call, Clone,
3737 AllocationCallToContextNodeMap.count(Call));
3738 for (auto &CE : Clone->CallerEdges) {
3739 // Ignore any caller that does not have a recorded callsite Call.
3740 if (!CE->Caller->hasCall())
3741 continue;
3742 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3743 }
3744 continue;
3745 }
3746
3747 // First locate which copy of OrigFunc to clone again. If a caller
3748 // of this callsite clone was already assigned to call a particular
3749 // function clone, we need to redirect all of those callers to the
3750 // new function clone, and update their other callees within this
3751 // function.
3752 FuncInfo PreviousAssignedFuncClone;
3753 auto EI = llvm::find_if(
3754 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3755 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3756 });
3757 bool CallerAssignedToCloneOfFunc = false;
3758 if (EI != Clone->CallerEdges.end()) {
3759 const std::shared_ptr<ContextEdge> &Edge = *EI;
3760 PreviousAssignedFuncClone =
3761 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3762 CallerAssignedToCloneOfFunc = true;
3763 }
3764
3765 // Clone function and save it along with the CallInfo map created
3766 // during cloning in the FuncClonesToCallMap.
3767 std::map<CallInfo, CallInfo> NewCallMap;
3768 unsigned CloneNo = FuncClonesToCallMap.size();
3769 assert(CloneNo > 0 && "Clone 0 is the original function, which "
3770 "should already exist in the map");
3771 FuncInfo NewFuncClone = cloneFunctionForCallsite(
3772 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3773 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3774 FunctionClonesAnalysis++;
3775 Changed = true;
3776
3777 // If no caller callsites were already assigned to a clone of this
3778 // function, we can simply assign this clone to the new func clone
3779 // and update all callers to it, then skip to the next clone.
3780 if (!CallerAssignedToCloneOfFunc) {
3781 AssignCallsiteCloneToFuncClone(
3782 NewFuncClone, Call, Clone,
3783 AllocationCallToContextNodeMap.count(Call));
3784 for (auto &CE : Clone->CallerEdges) {
3785 // Ignore any caller that does not have a recorded callsite Call.
3786 if (!CE->Caller->hasCall())
3787 continue;
3788 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3789 }
3790 continue;
3791 }
3792
3793 // We may need to do additional node cloning in this case.
3794 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3795 // that were previously assigned to call PreviousAssignedFuncClone,
3796 // to record that they now call NewFuncClone.
3797 // The none type edge removal may remove some of this Clone's caller
3798 // edges, if it is reached via another of its caller's callees.
3799 // Iterate over a copy and skip any that were removed.
3800 auto CallerEdges = Clone->CallerEdges;
3801 for (auto CE : CallerEdges) {
3802 // Skip any that have been removed on an earlier iteration.
3803 if (CE->isRemoved()) {
3804 assert(!is_contained(Clone->CallerEdges, CE));
3805 continue;
3806 }
3807 assert(CE);
3808 // Ignore any caller that does not have a recorded callsite Call.
3809 if (!CE->Caller->hasCall())
3810 continue;
3811
3812 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3813 // We subsequently fall through to later handling that
3814 // will perform any additional cloning required for
3815 // callers that were calling other function clones.
3816 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3817 PreviousAssignedFuncClone)
3818 continue;
3819
3820 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3821
3822 // If we are cloning a function that was already assigned to some
3823 // callers, then essentially we are creating new callsite clones
3824 // of the other callsites in that function that are reached by those
3825 // callers. Clone the other callees of the current callsite's caller
3826 // that were already assigned to PreviousAssignedFuncClone
3827 // accordingly. This is important since we subsequently update the
3828 // calls from the nodes in the graph and their assignments to callee
3829 // functions recorded in CallsiteToCalleeFuncCloneMap.
3830 // The none type edge removal may remove some of this caller's
3831 // callee edges, if it is reached via another of its callees.
3832 // Iterate over a copy and skip any that were removed.
3833 auto CalleeEdges = CE->Caller->CalleeEdges;
3834 for (auto CalleeEdge : CalleeEdges) {
3835 // Skip any that have been removed on an earlier iteration when
3836 // cleaning up newly None type callee edges.
3837 if (CalleeEdge->isRemoved()) {
3838 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
3839 continue;
3840 }
3841 assert(CalleeEdge);
3842 ContextNode *Callee = CalleeEdge->Callee;
3843 // Skip the current callsite, we are looking for other
3844 // callsites Caller calls, as well as any that do not have a
3845 // recorded callsite Call.
3846 if (Callee == Clone || !Callee->hasCall())
3847 continue;
3848 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3849 removeNoneTypeCalleeEdges(NewClone);
3850 // Moving the edge may have resulted in some none type
3851 // callee edges on the original Callee.
3852 removeNoneTypeCalleeEdges(Callee);
3853 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3854 // If the Callee node was already assigned to call a specific
3855 // function version, make sure its new clone is assigned to call
3856 // that same function clone.
3857 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3858 RecordCalleeFuncOfCallsite(
3859 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3860 // Update NewClone with the new Call clone of this callsite's Call
3861 // created for the new function clone created earlier.
3862 // Recall that we have already ensured when building the graph
3863 // that each caller can only call callsites within the same
3864 // function, so we are guaranteed that Callee Call is in the
3865 // current OrigFunc.
3866 // CallMap is set up as indexed by original Call at clone 0.
3867 CallInfo OrigCall(Callee->getOrigNode()->Call);
3868 OrigCall.setCloneNo(0);
3869 std::map<CallInfo, CallInfo> &CallMap =
3870 FuncClonesToCallMap[NewFuncClone];
3871 assert(CallMap.count(OrigCall));
3872 CallInfo NewCall(CallMap[OrigCall]);
3873 assert(NewCall);
3874 NewClone->setCall(NewCall);
3875 // Need to do the same for all matching calls.
3876 for (auto &MatchingCall : NewClone->MatchingCalls) {
3877 CallInfo OrigMatchingCall(MatchingCall);
3878 OrigMatchingCall.setCloneNo(0);
3879 assert(CallMap.count(OrigMatchingCall));
3880 CallInfo NewCall(CallMap[OrigMatchingCall]);
3881 assert(NewCall);
3882 // Updates the call in the list.
3883 MatchingCall = NewCall;
3884 }
3885 }
3886 }
3887 // Fall through to handling below to perform the recording of the
3888 // function for this callsite clone. This enables handling of cases
3889 // where the callers were assigned to different clones of a function.
3890 }
3891
3892 // See if we can use existing function clone. Walk through
3893 // all caller edges to see if any have already been assigned to
3894 // a clone of this callsite's function. If we can use it, do so. If not,
3895 // because that function clone is already assigned to a different clone
3896 // of this callsite, then we need to clone again.
3897 // Basically, this checking is needed to handle the case where different
3898 // caller functions/callsites may need versions of this function
3899 // containing different mixes of callsite clones across the different
3900 // callsites within the function. If that happens, we need to create
3901 // additional function clones to handle the various combinations.
3902 //
3903 // Keep track of any new clones of this callsite created by the
3904 // following loop, as well as any existing clone that we decided to
3905 // assign this clone to.
3906 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3907 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3908 // We need to be able to remove Edge from CallerEdges, so need to adjust
3909 // iterator in the loop.
3910 for (auto EI = Clone->CallerEdges.begin();
3911 EI != Clone->CallerEdges.end();) {
3912 auto Edge = *EI;
3913 // Ignore any caller that does not have a recorded callsite Call.
3914 if (!Edge->Caller->hasCall()) {
3915 EI++;
3916 continue;
3917 }
3918 // If this caller already assigned to call a version of OrigFunc, need
3919 // to ensure we can assign this callsite clone to that function clone.
3920 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3921 FuncInfo FuncCloneCalledByCaller =
3922 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3923 // First we need to confirm that this function clone is available
3924 // for use by this callsite node clone.
3925 //
3926 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3927 // its callsite clones, one of those callsite clones X could have
3928 // been assigned to the same function clone called by Edge's caller
3929 // - if Edge's caller calls another callsite within Node's original
3930 // function, and that callsite has another caller reaching clone X.
3931 // We need to clone Node again in this case.
3932 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3933 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3934 Clone) ||
3935 // Detect when we have multiple callers of this callsite that
3936 // have already been assigned to specific, and different, clones
3937 // of OrigFunc (due to other unrelated callsites in Func they
3938 // reach via call contexts). Is this Clone of callsite Node
3939 // assigned to a different clone of OrigFunc? If so, clone Node
3940 // again.
3941 (FuncCloneAssignedToCurCallsiteClone &&
3942 FuncCloneAssignedToCurCallsiteClone !=
3943 FuncCloneCalledByCaller)) {
3944 // We need to use a different newly created callsite clone, in
3945 // order to assign it to another new function clone on a
3946 // subsequent iteration over the Clones array (adjusted below).
3947 // Note we specifically do not reset the
3948 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3949 // when this new clone is processed later we know which version of
3950 // the function to copy (so that other callsite clones we have
3951 // assigned to that function clone are properly cloned over). See
3952 // comments in the function cloning handling earlier.
3953
3954 // Check if we already have cloned this callsite again while
3955 // walking through caller edges, for a caller calling the same
3956 // function clone. If so, we can move this edge to that new clone
3957 // rather than creating yet another new clone.
3958 if (FuncCloneToNewCallsiteCloneMap.count(
3959 FuncCloneCalledByCaller)) {
3960 ContextNode *NewClone =
3961 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3962 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3963 // Cleanup any none type edges cloned over.
3964 removeNoneTypeCalleeEdges(NewClone);
3965 } else {
3966 // Create a new callsite clone.
3967 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3968 removeNoneTypeCalleeEdges(NewClone);
3969 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3970 NewClone;
3971 // Add to list of clones and process later.
3972 ClonesWorklist.push_back(NewClone);
3973 assert(EI == Clone->CallerEdges.end() ||
3974 Clone->AllocTypes != (uint8_t)AllocationType::None);
3975 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3976 }
3977 // Moving the caller edge may have resulted in some none type
3978 // callee edges.
3979 removeNoneTypeCalleeEdges(Clone);
3980 // We will handle the newly created callsite clone in a subsequent
3981 // iteration over this Node's Clones. Continue here since we
3982 // already adjusted iterator EI while moving the edge.
3983 continue;
3984 }
3985
3986 // Otherwise, we can use the function clone already assigned to this
3987 // caller.
3988 if (!FuncCloneAssignedToCurCallsiteClone) {
3989 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3990 // Assign Clone to FuncCloneCalledByCaller
3991 AssignCallsiteCloneToFuncClone(
3992 FuncCloneCalledByCaller, Call, Clone,
3993 AllocationCallToContextNodeMap.count(Call));
3994 } else
3995 // Don't need to do anything - callsite is already calling this
3996 // function clone.
3997 assert(FuncCloneAssignedToCurCallsiteClone ==
3998 FuncCloneCalledByCaller);
3999
4000 } else {
4001 // We have not already assigned this caller to a version of
4002 // OrigFunc. Do the assignment now.
4003
4004 // First check if we have already assigned this callsite clone to a
4005 // clone of OrigFunc for another caller during this iteration over
4006 // its caller edges.
4007 if (!FuncCloneAssignedToCurCallsiteClone) {
4008 // Find first function in FuncClonesToCallMap without an assigned
4009 // clone of this callsite Node. We should always have one
4010 // available at this point due to the earlier cloning when the
4011 // FuncClonesToCallMap size was smaller than the clone number.
4012 for (auto &CF : FuncClonesToCallMap) {
4013 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
4014 FuncCloneAssignedToCurCallsiteClone = CF.first;
4015 break;
4016 }
4017 }
4018 assert(FuncCloneAssignedToCurCallsiteClone);
4019 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4020 AssignCallsiteCloneToFuncClone(
4021 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4022 AllocationCallToContextNodeMap.count(Call));
4023 } else
4024 assert(FuncCloneToCurNodeCloneMap
4025 [FuncCloneAssignedToCurCallsiteClone] == Clone);
4026 // Update callers to record function version called.
4027 RecordCalleeFuncOfCallsite(Edge->Caller,
4028 FuncCloneAssignedToCurCallsiteClone);
4029 }
4030
4031 EI++;
4032 }
4033 }
4034 if (VerifyCCG) {
4035 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4036 for (const auto &PE : Node->CalleeEdges)
4037 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4038 for (const auto &CE : Node->CallerEdges)
4039 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4040 for (auto *Clone : Node->Clones) {
4041 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4042 for (const auto &PE : Clone->CalleeEdges)
4043 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4044 for (const auto &CE : Clone->CallerEdges)
4045 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4046 }
4047 }
4048 }
4049 }
4050
4051 uint8_t BothTypes =
4052 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4053
4054 auto UpdateCalls = [&](ContextNode *Node,
4055 DenseSet<const ContextNode *> &Visited,
4056 auto &&UpdateCalls) {
4057 auto Inserted = Visited.insert(Node);
4058 if (!Inserted.second)
4059 return;
4060
4061 for (auto *Clone : Node->Clones)
4062 UpdateCalls(Clone, Visited, UpdateCalls);
4063
4064 for (auto &Edge : Node->CallerEdges)
4065 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4066
4067 // Skip if either no call to update, or if we ended up with no context ids
4068 // (we moved all edges onto other clones).
4069 if (!Node->hasCall() || Node->emptyContextIds())
4070 return;
4071
4072 if (Node->IsAllocation) {
4073 auto AT = allocTypeToUse(Node->AllocTypes);
4074 // If the allocation type is ambiguous, and more aggressive hinting
4075 // has been enabled via the MinClonedColdBytePercent flag, see if this
4076 // allocation should be hinted cold anyway because its fraction cold bytes
4077 // allocated is at least the given threshold.
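 // Illustrative arithmetic (hypothetical sizes): with
 // MinClonedColdBytePercent = 80 and this node's contexts accounting for
 // 1000 total bytes of which 850 came from cold contexts, the check below
 // (850 * 100 >= 1000 * 80) holds, so the ambiguous allocation is hinted
 // cold anyway.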
4078 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4079 !ContextIdToContextSizeInfos.empty()) {
4080 uint64_t TotalCold = 0;
4081 uint64_t Total = 0;
4082 for (auto Id : Node->getContextIds()) {
4083 auto TypeI = ContextIdToAllocationType.find(Id);
4084 assert(TypeI != ContextIdToAllocationType.end());
4085 auto CSI = ContextIdToContextSizeInfos.find(Id);
4086 if (CSI != ContextIdToContextSizeInfos.end()) {
4087 for (auto &Info : CSI->second) {
4088 Total += Info.TotalSize;
4089 if (TypeI->second == AllocationType::Cold)
4090 TotalCold += Info.TotalSize;
4091 }
4092 }
4093 }
4094 if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4095 AT = AllocationType::Cold;
4096 }
4097 updateAllocationCall(Node->Call, AT);
4098 assert(Node->MatchingCalls.empty());
4099 return;
4100 }
4101
4102 if (!CallsiteToCalleeFuncCloneMap.count(Node))
4103 return;
4104
4105 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4106 updateCall(Node->Call, CalleeFunc);
4107 // Update all the matching calls as well.
4108 for (auto &Call : Node->MatchingCalls)
4109 updateCall(Call, CalleeFunc);
4110 };
4111
4112 // Performs DFS traversal starting from allocation nodes to update calls to
4113 // reflect cloning decisions recorded earlier. For regular LTO this will
4114 // update the actual calls in the IR to call the appropriate function clone
4115 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4116 // are recorded in the summary entries.
4117 DenseSet<const ContextNode *> Visited;
4118 for (auto &Entry : AllocationCallToContextNodeMap)
4119 UpdateCalls(Entry.second, Visited, UpdateCalls);
4120
4121 return Changed;
4122}
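The cold-byte-fraction check in UpdateCalls above keeps the comparison in integer arithmetic: an ambiguous allocation is hinted cold when TotalCold * 100 >= Total * MinClonedColdBytePercent. A minimal standalone sketch of that test; the helper name and the sample values are illustrative, not part of this file:

#include <cstdint>

// Illustrative helper mirroring the threshold test above: true when cold
// bytes are at least MinColdBytePercent percent of all bytes allocated from
// the ambiguous context.
static bool shouldHintColdAnyway(uint64_t TotalCold, uint64_t Total,
                                 unsigned MinColdBytePercent) {
  // Same as (TotalCold / (double)Total) * 100 >= MinColdBytePercent, but kept
  // in integers so the fraction is not rounded.
  return TotalCold * 100 >= Total * MinColdBytePercent;
}

// Example: 80 of 100 profiled bytes were cold with a 75% threshold -> cold.
// shouldHintColdAnyway(80, 100, 75) == true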
4123
4124static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
4125 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4126 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4127 &FuncToAliasMap) {
4128 // The first "clone" is the original copy; we should only call this if we
4129 // needed to create new clones.
4130 assert(NumClones > 1);
4131 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4132 VMaps.reserve(NumClones - 1);
4133 FunctionsClonedThinBackend++;
4134 for (unsigned I = 1; I < NumClones; I++) {
4135 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
4136 auto *NewF = CloneFunction(&F, *VMaps.back());
4137 FunctionClonesThinBackend++;
4138 // Strip memprof and callsite metadata from clone as they are no longer
4139 // needed.
4140 for (auto &BB : *NewF) {
4141 for (auto &Inst : BB) {
4142 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
4143 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
4144 }
4145 }
4146 std::string Name = getMemProfFuncName(F.getName(), I);
4147 auto *PrevF = M.getFunction(Name);
4148 if (PrevF) {
4149 // We might have created this when adjusting a callsite in another
4150 // function. It should be a declaration.
4151 assert(PrevF->isDeclaration());
4152 NewF->takeName(PrevF);
4153 PrevF->replaceAllUsesWith(NewF);
4154 PrevF->eraseFromParent();
4155 } else
4156 NewF->setName(Name);
4157 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4158 << "created clone " << ore::NV("NewFunction", NewF));
4159
4160 // Now handle aliases to this function, and clone those as well.
4161 if (!FuncToAliasMap.count(&F))
4162 continue;
4163 for (auto *A : FuncToAliasMap[&F]) {
4164 std::string Name = getMemProfFuncName(A->getName(), I);
4165 auto *PrevA = M.getNamedAlias(Name);
4166 auto *NewA = GlobalAlias::create(A->getValueType(),
4167 A->getType()->getPointerAddressSpace(),
4168 A->getLinkage(), Name, NewF);
4169 NewA->copyAttributesFrom(A);
4170 if (PrevA) {
4171 // We might have created this when adjusting a callsite in another
4172 // function. It should be a declaration.
4173 assert(PrevA->isDeclaration());
4174 NewA->takeName(PrevA);
4175 PrevA->replaceAllUsesWith(NewA);
4176 PrevA->eraseFromParent();
4177 }
4178 }
4179 }
4180 return VMaps;
4181}
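For reference, the getMemProfFuncName helper used above appends the ".memprof." suffix plus the clone number for every copy other than the original (clone 0 keeps the original name). A sketch of that naming convention; the sample names are illustrative:

#include <string>

// Illustrative restatement of the clone-naming convention: clone 0 is the
// original function, later clones get a ".memprof.<CloneNo>" suffix.
static std::string memprofCloneName(const std::string &Base, unsigned CloneNo) {
  if (CloneNo == 0)
    return Base; // the original copy keeps its name
  return Base + ".memprof." + std::to_string(CloneNo);
}

// memprofCloneName("_Z3foov", 0) == "_Z3foov"
// memprofCloneName("_Z3foov", 2) == "_Z3foov.memprof.2"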
4182
4183// Locate the summary for F. This is complicated by the fact that it might
4184// have been internalized or promoted.
4185static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
4186 const ModuleSummaryIndex *ImportSummary) {
4187 // FIXME: Ideally we would retain the original GUID in some fashion on the
4188 // function (e.g. as metadata), but for now do our best to locate the
4189 // summary without that information.
4190 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
4191 if (!TheFnVI)
4192 // See if the function was internalized, by checking the index directly with
4193 // the original name (this avoids the name adjustment done by getGUID() for
4194 // internal symbols).
4195 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
4196 if (TheFnVI)
4197 return TheFnVI;
4198 // Now query with the original name before any promotion was performed.
4199 StringRef OrigName =
4200 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
4201 std::string OrigId = GlobalValue::getGlobalIdentifier(
4202 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
4203 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
4204 if (TheFnVI)
4205 return TheFnVI;
4206 // Could be a promoted local imported from another module. We need to pass
4207 // down more info here to find the original module id. For now, try with
4208 // the OrigName which might have been stored in the OidGuidMap in the
4209 // index. This would not work if there were same-named locals in multiple
4210 // modules, however.
4211 auto OrigGUID =
4212 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
4213 if (OrigGUID)
4214 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
4215 return TheFnVI;
4216}
4217
4218bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
4219 Module &M) {
4220 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
4221 Symtab = std::make_unique<InstrProfSymtab>();
4222 // Don't add canonical names, to avoid adding multiple functions to the symtab
4223 // when they share the same root name once "." suffixes are stripped.
4224 // If we pick the wrong one then this could lead to incorrect ICP and calling
4225 // a memprof clone that we don't actually create (resulting in linker unsats).
4226 // What this means is that the GUID of the function (or its PGOFuncName
4227 // metadata) *must* match that in the VP metadata to allow promotion.
4228 // In practice this should not be a limitation, since local functions should
4229 // have PGOFuncName metadata and global function names shouldn't need any
4230 // special handling (they should not get the ".llvm.*" suffix that the
4231 // canonicalization handling is attempting to strip).
4232 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
4233 std::string SymtabFailure = toString(std::move(E));
4234 M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
4235 return false;
4236 }
4237 return true;
4238}
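As the comment above notes, promotion only proceeds when the 64-bit id recorded in the call's value-profile metadata matches the GUID the symtab computes for the candidate function (or its PGOFuncName metadata). A GUID is an MD5-based hash of that name; a small sketch, with the sample mangled name purely illustrative:

#include "llvm/IR/GlobalValue.h"
using namespace llvm;

// Illustrative: compute the GUID for a sample mangled name. For an indirect
// call to be promoted here, the value recorded in the VP metadata must equal
// the GUID derived for the target via the InstrProfSymtab.
static GlobalValue::GUID sampleTargetGUID() {
  return GlobalValue::getGUID("_Z6calleev"); // sample name, not from this file
}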
4239
4240bool MemProfContextDisambiguation::applyImport(Module &M) {
4241 assert(ImportSummary);
4242 bool Changed = false;
4243
4244 // We also need to clone any aliases that reference cloned functions, because
4245 // the modified callsites may invoke via the alias. Keep track of the aliases
4246 // for each function.
4247 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4248 FuncToAliasMap;
4249 for (auto &A : M.aliases()) {
4250 auto *Aliasee = A.getAliaseeObject();
4251 if (auto *F = dyn_cast<Function>(Aliasee))
4252 FuncToAliasMap[F].insert(&A);
4253 }
4254
4255 if (!initializeIndirectCallPromotionInfo(M))
4256 return false;
4257
4258 for (auto &F : M) {
4259 if (F.isDeclaration() || isMemProfClone(F))
4260 continue;
4261
4262 OptimizationRemarkEmitter ORE(&F);
4263
4264 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4265 bool ClonesCreated = false;
4266 unsigned NumClonesCreated = 0;
4267 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
4268 // We should at least have version 0 which is the original copy.
4269 assert(NumClones > 0);
4270 // If only one copy is needed, use the original.
4271 if (NumClones == 1)
4272 return;
4273 // If we already performed cloning of this function, confirm that the
4274 // requested number of clones matches (the thin link should ensure the
4275 // number of clones for each constituent callsite is consistent within
4276 // each function), before returning.
4277 if (ClonesCreated) {
4278 assert(NumClonesCreated == NumClones);
4279 return;
4280 }
4281 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
4282 // The first "clone" is the original copy, which doesn't have a VMap.
4283 assert(VMaps.size() == NumClones - 1);
4284 Changed = true;
4285 ClonesCreated = true;
4286 NumClonesCreated = NumClones;
4287 };
4288
4289 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
4290 Function *CalledFunction) {
4291 // Perform cloning if not yet done.
4292 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
4293
4294 assert(!isMemProfClone(*CalledFunction));
4295
4296 // Update the calls per the summary info.
4297 // Save orig name since it gets updated in the first iteration
4298 // below.
4299 auto CalleeOrigName = CalledFunction->getName();
4300 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
4301 // Do nothing if this version calls the original version of its
4302 // callee.
4303 if (!StackNode.Clones[J])
4304 continue;
4305 auto NewF = M.getOrInsertFunction(
4306 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
4307 CalledFunction->getFunctionType());
4308 CallBase *CBClone;
4309 // Copy 0 is the original function.
4310 if (!J)
4311 CBClone = CB;
4312 else
4313 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4314 CBClone->setCalledFunction(NewF);
4315 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4316 << ore::NV("Call", CBClone) << " in clone "
4317 << ore::NV("Caller", CBClone->getFunction())
4318 << " assigned to call function clone "
4319 << ore::NV("Callee", NewF.getCallee()));
4320 }
4321 };
4322
4323 // Locate the summary for F.
4324 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
4325 // If not found, this could be an imported local (see comment in
4326 // findValueInfoForFunc). Skip for now as it will be cloned in its original
4327 // module (where it would have been promoted to global scope so should
4328 // satisfy any reference in this module).
4329 if (!TheFnVI)
4330 continue;
4331
4332 auto *GVSummary =
4333 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
4334 if (!GVSummary) {
4335 // Must have been imported, use the summary which matches the definition
4336 // (might be multiple if this was a linkonce_odr).
4337 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
4338 assert(SrcModuleMD &&
4339 "enable-import-metadata is needed to emit thinlto_src_module");
4340 StringRef SrcModule =
4341 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
4342 for (auto &GVS : TheFnVI.getSummaryList()) {
4343 if (GVS->modulePath() == SrcModule) {
4344 GVSummary = GVS.get();
4345 break;
4346 }
4347 }
4348 assert(GVSummary && GVSummary->modulePath() == SrcModule);
4349 }
4350
4351 // If this was an imported alias skip it as we won't have the function
4352 // summary, and it should be cloned in the original module.
4353 if (isa<AliasSummary>(GVSummary))
4354 continue;
4355
4356 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
4357
4358 if (FS->allocs().empty() && FS->callsites().empty())
4359 continue;
4360
4361 auto SI = FS->callsites().begin();
4362 auto AI = FS->allocs().begin();
4363
4364 // To handle callsite infos synthesized for tail calls which have missing
4365 // frames in the profiled context, map callee VI to the synthesized callsite
4366 // info.
4367 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
4368 // Iterate the callsites for this function in reverse, since we place all
4369 // those synthesized for tail calls at the end.
4370 for (auto CallsiteIt = FS->callsites().rbegin();
4371 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
4372 auto &Callsite = *CallsiteIt;
4373 // Stop as soon as we see a non-synthesized callsite info (see comment
4374 // above loop). All the entries added for discovered tail calls have empty
4375 // stack ids.
4376 if (!Callsite.StackIdIndices.empty())
4377 break;
4378 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
4379 }
4380
4381 // Keeps track of needed ICP for the function.
4382 SmallVector<ICallAnalysisData> ICallAnalysisInfo;
4383
4384 // Assume for now that the instructions are in the exact same order
4385 // as when the summary was created, but confirm this is correct by
4386 // matching the stack ids.
4387 for (auto &BB : F) {
4388 for (auto &I : BB) {
4389 auto *CB = dyn_cast<CallBase>(&I);
4390 // Same handling as when creating module summary.
4391 if (!mayHaveMemprofSummary(CB))
4392 continue;
4393
4394 auto *CalledValue = CB->getCalledOperand();
4395 auto *CalledFunction = CB->getCalledFunction();
4396 if (CalledValue && !CalledFunction) {
4397 CalledValue = CalledValue->stripPointerCasts();
4398 // Stripping pointer casts can reveal a called function.
4399 CalledFunction = dyn_cast<Function>(CalledValue);
4400 }
4401 // Check if this is an alias to a function. If so, get the
4402 // called aliasee for the checks below.
4403 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
4404 assert(!CalledFunction &&
4405 "Expected null called function in callsite for alias");
4406 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
4407 }
4408
4409 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
4410 I.getMetadata(LLVMContext::MD_callsite));
4411 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
4412
4413 // Include allocs that were already assigned a memprof function
4414 // attribute in the statistics.
4415 if (CB->getAttributes().hasFnAttr("memprof")) {
4416 assert(!MemProfMD);
4417 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4418 ? AllocTypeColdThinBackend++
4419 : AllocTypeNotColdThinBackend++;
4420 OrigAllocsThinBackend++;
4421 AllocVersionsThinBackend++;
4422 if (!MaxAllocVersionsThinBackend)
4423 MaxAllocVersionsThinBackend = 1;
4424 continue;
4425 }
4426
4427 if (MemProfMD) {
4428 // Consult the next alloc node.
4429 assert(AI != FS->allocs().end());
4430 auto &AllocNode = *(AI++);
4431
4432 // Sanity check that the MIB stack ids match between the summary and
4433 // instruction metadata.
4434 auto MIBIter = AllocNode.MIBs.begin();
4435 for (auto &MDOp : MemProfMD->operands()) {
4436 assert(MIBIter != AllocNode.MIBs.end());
4437 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
4438 MIBIter->StackIdIndices.begin();
4439 auto *MIBMD = cast<const MDNode>(MDOp);
4440 MDNode *StackMDNode = getMIBStackNode(MIBMD);
4441 assert(StackMDNode);
4442 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
4443 auto ContextIterBegin =
4444 StackContext.beginAfterSharedPrefix(CallsiteContext);
4445 // Skip the checking on the first iteration.
4446 uint64_t LastStackContextId =
4447 (ContextIterBegin != StackContext.end() &&
4448 *ContextIterBegin == 0)
4449 ? 1
4450 : 0;
4451 for (auto ContextIter = ContextIterBegin;
4452 ContextIter != StackContext.end(); ++ContextIter) {
4453 // If this is a direct recursion, simply skip the duplicate
4454 // entries, to be consistent with how the summary ids were
4455 // generated during ModuleSummaryAnalysis.
4456 if (LastStackContextId == *ContextIter)
4457 continue;
4458 LastStackContextId = *ContextIter;
4459 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
4460 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4461 *ContextIter);
4462 StackIdIndexIter++;
4463 }
4464 MIBIter++;
4465 }
4466
4467 // Perform cloning if not yet done.
4468 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
4469
4470 OrigAllocsThinBackend++;
4471 AllocVersionsThinBackend += AllocNode.Versions.size();
4472 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
4473 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
4474
4475 // If there is only one version that means we didn't end up
4476 // considering this function for cloning, and in that case the alloc
4477 // will still be none type or should have gotten the default NotCold.
4478 // Skip that after calling clone helper since that does some sanity
4479 // checks that confirm we haven't decided yet that we need cloning.
4480 // We might have a single version that is cold due to the
4481 // MinClonedColdBytePercent heuristic, make sure we don't skip in that
4482 // case.
4483 if (AllocNode.Versions.size() == 1 &&
4484 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
4485 assert((AllocationType)AllocNode.Versions[0] ==
4486 AllocationType::NotCold ||
4487 (AllocationType)AllocNode.Versions[0] ==
4488 AllocationType::None);
4489 UnclonableAllocsThinBackend++;
4490 continue;
4491 }
4492
4493 // All versions should have a singular allocation type.
4494 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
4495 return Type == ((uint8_t)AllocationType::NotCold |
4496 (uint8_t)AllocationType::Cold);
4497 }));
4498
4499 // Update the allocation types per the summary info.
4500 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
4501 // Ignore any that didn't get an assigned allocation type.
4502 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
4503 continue;
4504 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
4505 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
4506 : AllocTypeNotColdThinBackend++;
4507 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
4508 auto A = llvm::Attribute::get(F.getContext(), "memprof",
4509 AllocTypeString);
4510 CallBase *CBClone;
4511 // Copy 0 is the original function.
4512 if (!J)
4513 CBClone = CB;
4514 else
4515 // Since VMaps are only created for new clones, we index with
4516 // clone J-1 (J==0 is the original clone and does not have a VMaps
4517 // entry).
4518 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4519 CBClone->addFnAttr(A);
4520 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
4521 << ore::NV("AllocationCall", CBClone) << " in clone "
4522 << ore::NV("Caller", CBClone->getFunction())
4523 << " marked with memprof allocation attribute "
4524 << ore::NV("Attribute", AllocTypeString));
4525 }
4526 } else if (!CallsiteContext.empty()) {
4527 if (!CalledFunction) {
4528#ifndef NDEBUG
4529 // We should have skipped inline assembly calls.
4530 auto *CI = dyn_cast<CallInst>(CB);
4531 assert(!CI || !CI->isInlineAsm());
4532#endif
4533 // We should have skipped direct calls via a Constant.
4534 assert(CalledValue && !isa<Constant>(CalledValue));
4535
4536 // This is an indirect call, see if we have profile information and
4537 // whether any clones were recorded for the profiled targets (that
4538 // we synthesized CallsiteInfo summary records for when building the
4539 // index).
4540 auto NumClones =
4541 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
4542
4543 // Perform cloning if not yet done. This is done here in case
4544 // we don't need to do ICP, but might need to clone this
4545 // function as it is the target of other cloned calls.
4546 if (NumClones)
4547 CloneFuncIfNeeded(NumClones);
4548 }
4549
4550 else {
4551 // Consult the next callsite node.
4552 assert(SI != FS->callsites().end());
4553 auto &StackNode = *(SI++);
4554
4555#ifndef NDEBUG
4556 // Sanity check that the stack ids match between the summary and
4557 // instruction metadata.
4558 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
4559 for (auto StackId : CallsiteContext) {
4560 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
4561 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4562 StackId);
4563 StackIdIndexIter++;
4564 }
4565#endif
4566
4567 CloneCallsite(StackNode, CB, CalledFunction);
4568 }
4569 } else if (CB->isTailCall() && CalledFunction) {
4570 // Locate the synthesized callsite info for the callee VI, if any was
4571 // created, and use that for cloning.
4572 ValueInfo CalleeVI =
4573 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
4574 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
4575 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
4576 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
4577 CloneCallsite(Callsite->second, CB, CalledFunction);
4578 }
4579 }
4580 }
4581 }
4582
4583 // Now do any promotion required for cloning.
4584 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
4585 }
4586
4587 // We skip some of the functions and instructions above, so remove all the
4588 // metadata in a single sweep here.
4589 for (auto &F : M) {
4590 // We can skip memprof clones because createFunctionClones already strips
4591 // the metadata from the newly created clones.
4592 if (F.isDeclaration() || isMemProfClone(F))
4593 continue;
4594 for (auto &BB : F) {
4595 for (auto &I : BB) {
4596 if (!isa<CallBase>(I))
4597 continue;
4598 I.setMetadata(LLVMContext::MD_memprof, nullptr);
4599 I.setMetadata(LLVMContext::MD_callsite, nullptr);
4600 }
4601 }
4602 }
4603
4604 return Changed;
4605}
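A pattern that recurs throughout applyImport above is mapping a call in the original function to the corresponding call in clone J via the value map recorded when that clone was created (clone 0 is the original and has no map). A condensed sketch of that lookup; the helper name is illustrative:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <memory>
using namespace llvm;

// Illustrative helper: given a call in the original function and the per-clone
// value maps (one map per non-original clone), return the matching call in
// clone J. J == 0 refers to the original function itself.
static CallBase *
callInClone(CallBase *OrigCall,
            ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, unsigned J) {
  if (J == 0)
    return OrigCall;
  // VMaps[J - 1] maps each original value to its copy in clone J.
  return cast<CallBase>((*VMaps[J - 1])[OrigCall]);
}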
4606
4607unsigned MemProfContextDisambiguation::recordICPInfo(
4608 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
4609 ArrayRef<CallsiteInfo>::iterator &SI,
4610 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
4611 // First see if we have profile information for this indirect call.
4612 uint32_t NumCandidates;
4613 uint64_t TotalCount;
4614 auto CandidateProfileData =
4615 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
4616 NumCandidates);
4617 if (CandidateProfileData.empty())
4618 return 0;
4619
4620 // Iterate through all of the candidate profiled targets along with the
4621 // CallsiteInfo summary records synthesized for them when building the index,
4622 // and see if any are cloned and/or refer to clones.
4623 bool ICPNeeded = false;
4624 unsigned NumClones = 0;
4625 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
4626 for (const auto &Candidate : CandidateProfileData) {
4627#ifndef NDEBUG
4628 auto CalleeValueInfo =
4629#endif
4630 ImportSummary->getValueInfo(Candidate.Value);
4631 // We might not have a ValueInfo if this is a distributed
4632 // ThinLTO backend and decided not to import that function.
4633 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
4634 assert(SI != AllCallsites.end());
4635 auto &StackNode = *(SI++);
4636 // See if any of the clones of the indirect callsite for this
4637 // profiled target should call a cloned version of the profiled
4638 // target. We only need to do the ICP here if so.
4639 ICPNeeded |= llvm::any_of(StackNode.Clones,
4640 [](unsigned CloneNo) { return CloneNo != 0; });
4641 // Every callsite in the same function should have been cloned the same
4642 // number of times.
4643 assert(!NumClones || NumClones == StackNode.Clones.size());
4644 NumClones = StackNode.Clones.size();
4645 }
4646 if (!ICPNeeded)
4647 return NumClones;
4648 // Save information for ICP, which is performed later to avoid messing up the
4649 // current function traversal.
4650 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
4651 TotalCount, CallsiteInfoStartIndex});
4652 return NumClones;
4653}
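The decision captured above boils down to: promotion is only needed if at least one recorded clone number for a candidate's synthesized callsite is non-zero, i.e. some clone of this caller must call a cloned target rather than the original. A standalone sketch of that predicate; the helper name is illustrative:

#include <algorithm>
#include <vector>

// Illustrative predicate mirroring the check above: clone number 0 means
// "call the original target", so ICP is only worthwhile when any recorded
// clone number is non-zero.
static bool needsICP(const std::vector<unsigned> &CloneNos) {
  return std::any_of(CloneNos.begin(), CloneNos.end(),
                     [](unsigned CloneNo) { return CloneNo != 0; });
}

// needsICP({0, 0, 0}) == false  // every caller version calls the original
// needsICP({0, 2, 0}) == true   // caller version 1 must call target clone 2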
4654
4655void MemProfContextDisambiguation::performICP(
4656 Module &M, ArrayRef<CallsiteInfo> AllCallsites,
4657 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
4658 ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
4659 OptimizationRemarkEmitter &ORE) {
4660 // Now do any promotion required for cloning. Specifically, for each
4661 // recorded ICP candidate (which was only recorded because one clone of that
4662 // candidate should call a cloned target), we perform ICP (speculative
4663 // devirtualization) for each clone of the callsite, and update its callee
4664 // to the appropriate clone. Note that the ICP compares against the original
4665 // version of the target, which is what is in the vtable.
4666 for (auto &Info : ICallAnalysisInfo) {
4667 auto *CB = Info.CB;
4668 auto CallsiteIndex = Info.CallsiteInfoStartIndex;
4669 auto TotalCount = Info.TotalCount;
4670 unsigned NumPromoted = 0;
4671 unsigned NumClones = 0;
4672
4673 for (auto &Candidate : Info.CandidateProfileData) {
4674 auto &StackNode = AllCallsites[CallsiteIndex++];
4675
4676 // All calls in the same function must have the same number of clones.
4677 assert(!NumClones || NumClones == StackNode.Clones.size());
4678 NumClones = StackNode.Clones.size();
4679
4680 // See if the target is in the module. If it wasn't imported, it is
4681 // possible that this profile could have been collected on a different
4682 // target (or version of the code), and we need to be conservative
4683 // (similar to what is done in the ICP pass).
4684 Function *TargetFunction = Symtab->getFunction(Candidate.Value);
4685 if (TargetFunction == nullptr ||
4686 // Any ThinLTO global dead symbol removal should have already
4687 // occurred, so it should be safe to promote when the target is a
4688 // declaration.
4689 // TODO: Remove internal option once more fully tested.
4690 (MemProfRequireDefinitionForPromotion &&
4691 TargetFunction->isDeclaration())) {
4692 ORE.emit([&]() {
4693 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
4694 << "Memprof cannot promote indirect call: target with md5sum "
4695 << ore::NV("target md5sum", Candidate.Value) << " not found";
4696 });
4697 // FIXME: See if we can use the new declaration importing support to
4698 // at least get the declarations imported for this case. Hot indirect
4699 // targets should have been imported normally, however.
4700 continue;
4701 }
4702
4703 // Check if legal to promote
4704 const char *Reason = nullptr;
4705 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
4706 ORE.emit([&]() {
4707 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
4708 << "Memprof cannot promote indirect call to "
4709 << ore::NV("TargetFunction", TargetFunction)
4710 << " with count of " << ore::NV("TotalCount", TotalCount)
4711 << ": " << Reason;
4712 });
4713 continue;
4714 }
4715
4716 assert(!isMemProfClone(*TargetFunction));
4717
4718 // Handle each call clone, applying ICP so that each clone directly
4719 // calls the specified callee clone, guarded by the appropriate ICP
4720 // check.
4721 CallBase *CBClone = CB;
4722 for (unsigned J = 0; J < NumClones; J++) {
4723 // Copy 0 is the original function.
4724 if (J > 0)
4725 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4726 // We do the promotion using the original name, so that the comparison
4727 // is against the name in the vtable. Then just below, change the new
4728 // direct call to call the cloned function.
4729 auto &DirectCall =
4730 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
4731 TotalCount, isSamplePGO, &ORE);
4732 auto *TargetToUse = TargetFunction;
4733 // Call original if this version calls the original version of its
4734 // callee.
4735 if (StackNode.Clones[J]) {
4736 TargetToUse =
4737 cast<Function>(M.getOrInsertFunction(
4738 getMemProfFuncName(TargetFunction->getName(),
4739 StackNode.Clones[J]),
4740 TargetFunction->getFunctionType())
4741 .getCallee());
4742 }
4743 DirectCall.setCalledFunction(TargetToUse);
4744 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4745 << ore::NV("Call", CBClone) << " in clone "
4746 << ore::NV("Caller", CBClone->getFunction())
4747 << " promoted and assigned to call function clone "
4748 << ore::NV("Callee", TargetToUse));
4749 }
4750
4751 // Update TotalCount (all clones should get same count above)
4752 TotalCount -= Candidate.Count;
4753 NumPromoted++;
4754 }
4755 // Adjust the MD.prof metadata for all clones, now that we have the new
4756 // TotalCount and the number promoted.
4757 CallBase *CBClone = CB;
4758 for (unsigned J = 0; J < NumClones; J++) {
4759 // Copy 0 is the original function.
4760 if (J > 0)
4761 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4762 // First delete the old one.
4763 CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
4764 // If all promoted, we don't need the MD.prof metadata.
4765 // Otherwise we need to update it with the un-promoted records.
4766 if (TotalCount != 0)
4767 annotateValueSite(
4768 M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
4769 TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
4770 }
4771 }
4772}
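Conceptually, the promotion performed above turns each clone of the indirect call into a compare against the original target (the address the profile, and any vtable, actually recorded) followed by a direct call that is then retargeted to the appropriate memprof clone. A C++ analog of one promoted caller clone; all names here are illustrative, not from this file:

#include <cstdio>

using FnTy = void (*)();

static void originalTarget() { std::puts("original"); }
// Stand-in for the ".memprof.1" clone the summary says this caller should use.
static void originalTarget_memprof_1() { std::puts("clone 1"); }

// The guard compares against the original target; the direct call on the
// matched path is redirected to the clone, the fallback stays indirect.
static void promotedCallSketch(FnTy IndirectTarget) {
  if (IndirectTarget == &originalTarget)
    originalTarget_memprof_1();
  else
    IndirectTarget();
}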
4773
4774template <typename DerivedCCG, typename FuncTy, typename CallTy>
4775bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
4776 if (DumpCCG) {
4777 dbgs() << "CCG before cloning:\n";
4778 dbgs() << *this;
4779 }
4780 if (ExportToDot)
4781 exportToDot("postbuild");
4782
4783 if (VerifyCCG) {
4784 check();
4785 }
4786
4787 identifyClones();
4788
4789 if (VerifyCCG) {
4790 check();
4791 }
4792
4793 if (DumpCCG) {
4794 dbgs() << "CCG after cloning:\n";
4795 dbgs() << *this;
4796 }
4797 if (ExportToDot)
4798 exportToDot("cloned");
4799
4800 bool Changed = assignFunctions();
4801
4802 if (DumpCCG) {
4803 dbgs() << "CCG after assigning function clones:\n";
4804 dbgs() << *this;
4805 }
4806 if (ExportToDot)
4807 exportToDot("clonefuncassign");
4808
4810 printTotalSizes(errs());
4811
4812 return Changed;
4813}
4814
4815bool MemProfContextDisambiguation::processModule(
4816 Module &M,
4817 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
4818
4819 // If we have an import summary, then the cloning decisions were made during
4820 // the thin link on the index. Apply them and return.
4821 if (ImportSummary)
4822 return applyImport(M);
4823
4824 // TODO: If/when other types of memprof cloning are enabled beyond just for
4825 // hot and cold, we will need to change this to individually control the
4826 // AllocationType passed to addStackNodesForMIB during CCG construction.
4827 // Note that we specifically check this after applying imports above, so that
4828 // the option isn't needed to be passed to distributed ThinLTO backend
4829 // clang processes, which won't necessarily have visibility into the linker
4830 // dependences. Instead the information is communicated from the LTO link to
4831 // the backends via the combined summary index.
4832 if (!SupportsHotColdNew)
4833 return false;
4834
4835 ModuleCallsiteContextGraph CCG(M, OREGetter);
4836 return CCG.process();
4837}
4838
4839MemProfContextDisambiguation::MemProfContextDisambiguation(
4840 const ModuleSummaryIndex *Summary, bool isSamplePGO)
4841 : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
4842 if (ImportSummary) {
4843 // The MemProfImportSummary should only be used for testing ThinLTO
4844 // distributed backend handling via opt, in which case we don't have a
4845 // summary from the pass pipeline.
4846 assert(MemProfImportSummary.empty());
4847 return;
4848 }
4849 if (MemProfImportSummary.empty())
4850 return;
4851
4852 auto ReadSummaryFile =
4853 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
4854 if (!ReadSummaryFile) {
4855 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
4856 "Error loading file '" + MemProfImportSummary +
4857 "': ");
4858 return;
4859 }
4860 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
4861 if (!ImportSummaryForTestingOrErr) {
4862 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
4863 "Error parsing file '" + MemProfImportSummary +
4864 "': ");
4865 return;
4866 }
4867 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
4868 ImportSummary = ImportSummaryForTesting.get();
4869}
4870
4871PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
4872 ModuleAnalysisManager &AM) {
4873 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
4874 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
4875 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4876 };
4877 if (!processModule(M, OREGetter))
4878 return PreservedAnalyses::all();
4879 return PreservedAnalyses::none();
4880}
4881
4882void MemProfContextDisambiguation::run(
4883 ModuleSummaryIndex &Index,
4884 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
4885 isPrevailing) {
4886 // TODO: If/when other types of memprof cloning are enabled beyond just for
4887 // hot and cold, we will need to change this to individually control the
4888 // AllocationType passed to addStackNodesForMIB during CCG construction.
4889 // The index was set from the option, so these should be in sync.
4890 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
4891 if (!SupportsHotColdNew)
4892 return;
4893
4894 IndexCallsiteContextGraph CCG(Index, isPrevailing);
4895 CCG.process();
4896}