Bug Summary

File: /build/source/bolt/runtime/instr.cpp
Warning: line 1183, column 20
Value stored to 'Parent' during its initialization is never read
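
The diagnostic is a dead store: 'Parent' is initialized from a computed value but never read afterwards, so the initialization has no effect. A minimal sketch of the flagged pattern (hypothetical names):

    const uint32_t Parent = Tree[Cur].InEdges[0].Node; // value stored here...
    const uint32_t ParentEdge = Tree[Cur].InEdges[0].ID;
    // ...but only ParentEdge is used below; 'Parent' is never read again.

The usual fix is to delete the unused variable (or cast the expression to void if its evaluation must be kept).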

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name instr.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -ffreestanding -target-cpu x86-64 -target-feature -sse -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins/tools/bolt/bolt_rt-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -I . -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -source-date-epoch 1685830157 -O3 -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins/tools/bolt/bolt_rt-bins -ferror-limit 19 -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2023-06-03-235318-65937-1 -x c++ /build/source/bolt/runtime/instr.cpp
1//===- bolt/runtime/instr.cpp ---------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// BOLT runtime instrumentation library for x86 Linux. Currently, BOLT does
10// not support linking modules with dependencies on one another into the final
11// binary (TODO?), which means this library has to be self-contained in a single
12// module.
13//
14// All extern declarations here need to be defined by BOLT itself. Those will be
15// undefined symbols that BOLT needs to resolve by emitting these symbols with
16// MCStreamer. Currently, Passes/Instrumentation.cpp is the pass responsible
17// for defining the symbols here, and the two files are tightly coupled: one
18// works statically when you run BOLT, the other at runtime when you run an
19// instrumented binary. The main goal here is to output an fdata file
20// (BOLT profile) with the instrumentation counters inserted by the static pass.
21// Counters for indirect calls are an exception, as we can't know them
22// statically. These counters are created and managed here. To allow this, we
23// need a minimal framework for allocating memory dynamically. We provide this
24// with the BumpPtrAllocator class (not LLVM's, but our own version of it).
25//
26// Since this code is intended to be inserted into any executable, we decided to
27// make it standalone and not depend on any external libraries (i.e. language
28// support libraries, such as glibc or stdc++). To allow this, we provide a few
29// light implementations of common OS-interaction functionality using direct
30// syscall wrappers. Our simple allocator doesn't manage deallocations that
31// fragment the memory space, so it's stack based. This is the minimal framework
32// provided here to allow processing instrumented counters and writing fdata.
33//
34// In the C++ idiom used here, we never use or rely on constructors or
35// destructors for global objects. That's because those need support from the
36// linker in initialization/finalization code, and we want to keep our linker
37// very simple. Similarly, we don't create any global objects that are zero
38// initialized, since those would need to go in .bss, which our simple linker
39// also doesn't support (TODO?).
40//
41//===----------------------------------------------------------------------===//
42
43#if defined(__x86_64__)
44#include "common.h"
45
46// Enables very verbose logging to stderr, useful when debugging
47//#define ENABLE_DEBUG
48
49#ifdef ENABLE_DEBUG
50#define DEBUG(X) \
51 { X; }
52#else
53#define DEBUG(X) \
54 {}
55#endif
56
57#pragma GCC visibility push(hidden)
58
59extern "C" {
60
61#if defined(__APPLE__)
62extern uint64_t* _bolt_instr_locations_getter();
63extern uint32_t _bolt_num_counters_getter();
64
65extern uint8_t* _bolt_instr_tables_getter();
66extern uint32_t _bolt_instr_num_funcs_getter();
67
68#else
69
70// Main counters inserted by instrumentation, incremented during runtime when
71// points of interest (locations) in the program are reached. Those are direct
72// calls and direct and indirect branches (local ones). There are also counters
73// for basic block execution if they are a spanning tree leaf and need to be
74// counted in order to infer the execution count of other edges of the CFG.
75extern uint64_t __bolt_instr_locations[];
76extern uint32_t __bolt_num_counters;
77// Descriptions are serialized metadata about binary functions written by BOLT,
78// so we have a minimal understanding of the program structure. For a
79// reference on the exact format of this metadata, see *Description structs,
80// Location, InstrumentedNode and EntryNode.
81// Number of indirect call site descriptions
82extern uint32_t __bolt_instr_num_ind_calls;
83// Number of indirect call target descriptions
84extern uint32_t __bolt_instr_num_ind_targets;
85// Number of function descriptions
86extern uint32_t __bolt_instr_num_funcs;
87// Time to sleep across dumps (when we write the fdata profile to disk)
88extern uint32_t __bolt_instr_sleep_time;
89// Do not clear counters across dumps, rewrite file with the updated values
90extern bool __bolt_instr_no_counters_clear;
91// Wait until all forks of instrumented process will finish
92extern bool __bolt_instr_wait_forks;
93// Filename to dump data to
94extern char __bolt_instr_filename[];
95// Instrumented binary file path
96extern char __bolt_instr_binpath[];
97// If true, append current PID to the fdata filename when creating it so
98// different invocations of the same program can be differentiated.
99extern bool __bolt_instr_use_pid;
100// Functions that will be used to instrument indirect calls. BOLT static pass
101// will identify indirect calls and modify them to load the address in these
102// trampolines and call this address instead. BOLT can't use direct calls to
103// our handlers because our addresses here are not known at analysis time. We
104// only support resolving dependencies from this file to the output of BOLT,
105// *not* the other way around.
106// TODO: We need better linking support to make that happen.
107extern void (*__bolt_ind_call_counter_func_pointer)();
108extern void (*__bolt_ind_tailcall_counter_func_pointer)();
109// Function pointers to init/fini trampoline routines in the binary, so we can
110// resume regular execution of these functions that we hooked
111extern void __bolt_start_trampoline();
112extern void __bolt_fini_trampoline();
113
114#endif
115}
116
117namespace {
118
119/// A simple allocator that mmaps a fixed size region and manages this space
120/// in a stack fashion, meaning you always deallocate the last element that
121/// was allocated. In practice, we don't need to deallocate individual elements.
122/// We monotonically increase our usage and then deallocate everything once we
123/// are done processing something.
124class BumpPtrAllocator {
125 /// This is written before each allocation and acts as a canary to detect when
126 /// a bug causes our program to cross allocation boundaries.
127 struct EntryMetadata {
128 uint64_t Magic;
129 uint64_t AllocSize;
130 };
131
132public:
133 void *allocate(size_t Size) {
134 Lock L(M);
135
136 if (StackBase == nullptr) {
137#if defined(__APPLE__)
138 int MAP_PRIVATE_MAP_ANONYMOUS = 0x1002;
139#else
140 int MAP_PRIVATE_MAP_ANONYMOUS = 0x22;
141#endif
142 StackBase = reinterpret_cast<uint8_t *>(
143 __mmap(0, MaxSize, 0x3 /* PROT_READ | PROT_WRITE*/,
144 Shared ? 0x21 /*MAP_SHARED | MAP_ANONYMOUS*/
145 : MAP_PRIVATE_MAP_ANONYMOUS /* MAP_PRIVATE | MAP_ANONYMOUS*/,
146 -1, 0));
147 StackSize = 0;
148 }
149
150 Size = alignTo(Size + sizeof(EntryMetadata), 16);
151 uint8_t *AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
152 auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
153 M->Magic = Magic;
154 M->AllocSize = Size;
155 StackSize += Size;
156 assert(StackSize < MaxSize, "allocator ran out of memory");
157 return AllocAddress;
158 }
159
160#ifdef DEBUG
161 /// Element-wise deallocation is only used for debugging to catch memory
162 /// bugs by checking magic bytes. Ordinarily, we reset the allocator once
163 /// we are done with it. Reset is done with clear(). There's no need
164 /// to deallocate each element individually.
165 void deallocate(void *Ptr) {
166 Lock L(M);
167 uint8_t MetadataOffset = sizeof(EntryMetadata);
168 auto *M = reinterpret_cast<EntryMetadata *>(
169 reinterpret_cast<uint8_t *>(Ptr) - MetadataOffset);
170 const uint8_t *StackTop = StackBase + StackSize + MetadataOffset;
171 // Validate size
172 if (Ptr != StackTop - M->AllocSize) {
173 // Failed validation, check if it is a pointer returned by operator new []
174 MetadataOffset +=
175 sizeof(uint64_t); // Space for number of elements alloc'ed
176 M = reinterpret_cast<EntryMetadata *>(reinterpret_cast<uint8_t *>(Ptr) -
177 MetadataOffset);
178 // Ok, it failed both checks if this assertion fails. Stop the program, we
179 // have a memory bug.
180 assert(Ptr == StackTop - M->AllocSize,
181 "must deallocate the last element alloc'ed");
182 }
183 assert(M->Magic == Magic, "allocator magic is corrupt");
184 StackSize -= M->AllocSize;
185 }
186#else
187 void deallocate(void *) {}
188#endif
189
190 void clear() {
191 Lock L(M);
192 StackSize = 0;
193 }
194
195 /// Set mmap reservation size (only relevant before first allocation)
196 void setMaxSize(uint64_t Size) { MaxSize = Size; }
197
198 /// Set mmap reservation privacy (only relevant before first allocation)
199 void setShared(bool S) { Shared = S; }
200
201 void destroy() {
202 if (StackBase == nullptr)
203 return;
204 __munmap(StackBase, MaxSize);
205 }
206
207private:
208 static constexpr uint64_t Magic = 0x1122334455667788ull;
209 uint64_t MaxSize = 0xa00000;
210 uint8_t *StackBase{nullptr};
211 uint64_t StackSize{0};
212 bool Shared{false};
213 Mutex M;
214};
215
216/// Used for allocating indirect call instrumentation counters. Initialized by
217/// __bolt_instr_setup, our initialization routine.
218BumpPtrAllocator GlobalAlloc;
219} // anonymous namespace
220
221// User-defined placement new operators. We only use those (as opposed to
222// overriding the regular operator new) so we can keep our allocator in the
223// stack instead of in a data section (global).
224void *operator new(size_t Sz, BumpPtrAllocator &A) { return A.allocate(Sz); }
225void *operator new(size_t Sz, BumpPtrAllocator &A, char C) {
226 auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
227 memset(Ptr, C, Sz);
228 return Ptr;
229}
230void *operator new[](size_t Sz, BumpPtrAllocator &A) {
231 return A.allocate(Sz);
232}
233void *operator new[](size_t Sz, BumpPtrAllocator &A, char C) {
234 auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
235 memset(Ptr, C, Sz);
236 return Ptr;
237}
238// Only called during exception unwinding (useless). We must manually dealloc.
239// C++ language weirdness
240void operator delete(void *Ptr, BumpPtrAllocator &A) { A.deallocate(Ptr); }
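
A short usage sketch for these placement operators (illustrative, not part of the source): allocations come from a BumpPtrAllocator and are released in bulk rather than individually.

    BumpPtrAllocator A;
    // Zero-initialized array of 8 counters carved out of the bump region.
    uint64_t *Counters = new (A, 0) uint64_t[8];
    // ... use Counters ...
    A.clear(); // stack-style release of everything allocated so far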
241
242namespace {
243
244// Disable instrumentation optimizations that sacrifice profile accuracy
245extern "C" bool __bolt_instr_conservative;
246
247/// Basic key-val atom stored in our hash
248struct SimpleHashTableEntryBase {
249 uint64_t Key;
250 uint64_t Val;
251};
252
253/// This hash table implementation starts by allocating a table of size
254/// InitialSize. When conflicts happen in this main table, it resolves
255/// them by chaining a new table of size IncSize. It never reallocs as our
256/// allocator doesn't support it. Keys are intended to be function pointers.
257/// There's no clever hash function (it's just x mod size, size being prime).
258/// I never tuned the coefficients in the modular equation (TODO).
259/// This is used for indirect calls (each call site has one of these, so it
260/// should have a small footprint) and for tallying call counts globally for
261/// each target to check if we missed the origin of some calls (this one is a
262/// large instantiation of this template, since it is global for all call sites)
263template <typename T = SimpleHashTableEntryBase, uint32_t InitialSize = 7,
264 uint32_t IncSize = 7>
265class SimpleHashTable {
266public:
267 using MapEntry = T;
268
269 /// Increment by 1 the value of \p Key. If it is not in this table, it will be
270 /// added to the table and its value set to 1.
271 void incrementVal(uint64_t Key, BumpPtrAllocator &Alloc) {
272 ++get(Key, Alloc).Val;
273 }
274
275 /// Basic member accessing interface. Here we pass the allocator explicitly to
276 /// avoid storing a pointer to it as part of this table (remember there is one
277 /// hash for each indirect call site, so we want to minimize our footprint).
278 MapEntry &get(uint64_t Key, BumpPtrAllocator &Alloc) {
279 if (!__bolt_instr_conservative) {
280 TryLock L(M);
281 if (!L.isLocked())
282 return NoEntry;
283 return getOrAllocEntry(Key, Alloc);
284 }
285 Lock L(M);
286 return getOrAllocEntry(Key, Alloc);
287 }
288
289 /// Traverses all elements in the table
290 template <typename... Args>
291 void forEachElement(void (*Callback)(MapEntry &, Args...), Args... args) {
292 Lock L(M);
293 if (!TableRoot)
294 return;
295 return forEachElement(Callback, InitialSize, TableRoot, args...);
296 }
297
298 void resetCounters();
299
300private:
301 constexpr static uint64_t VacantMarker = 0;
302 constexpr static uint64_t FollowUpTableMarker = 0x8000000000000000ull;
303
304 MapEntry *TableRoot{nullptr};
305 MapEntry NoEntry;
306 Mutex M;
307
308 template <typename... Args>
309 void forEachElement(void (*Callback)(MapEntry &, Args...),
310 uint32_t NumEntries, MapEntry *Entries, Args... args) {
311 for (uint32_t I = 0; I < NumEntries; ++I) {
312 MapEntry &Entry = Entries[I];
313 if (Entry.Key == VacantMarker)
314 continue;
315 if (Entry.Key & FollowUpTableMarker) {
316 forEachElement(Callback, IncSize,
317 reinterpret_cast<MapEntry *>(Entry.Key &
318 ~FollowUpTableMarker),
319 args...);
320 continue;
321 }
322 Callback(Entry, args...);
323 }
324 }
325
326 MapEntry &firstAllocation(uint64_t Key, BumpPtrAllocator &Alloc) {
327 TableRoot = new (Alloc, 0) MapEntry[InitialSize];
328 MapEntry &Entry = TableRoot[Key % InitialSize];
329 Entry.Key = Key;
330 return Entry;
331 }
332
333 MapEntry &getEntry(MapEntry *Entries, uint64_t Key, uint64_t Selector,
334 BumpPtrAllocator &Alloc, int CurLevel) {
335 const uint32_t NumEntries = CurLevel == 0 ? InitialSize : IncSize;
336 uint64_t Remainder = Selector / NumEntries;
337 Selector = Selector % NumEntries;
338 MapEntry &Entry = Entries[Selector];
339
340 // A hit
341 if (Entry.Key == Key) {
342 return Entry;
343 }
344
345 // Vacant - add new entry
346 if (Entry.Key == VacantMarker) {
347 Entry.Key = Key;
348 return Entry;
349 }
350
351 // Defer to the next level
352 if (Entry.Key & FollowUpTableMarker) {
353 return getEntry(
354 reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker),
355 Key, Remainder, Alloc, CurLevel + 1);
356 }
357
358 // Conflict - create the next level
359 MapEntry *NextLevelTbl = new (Alloc, 0) MapEntry[IncSize];
360 uint64_t CurEntrySelector = Entry.Key / InitialSize;
361 for (int I = 0; I < CurLevel; ++I)
362 CurEntrySelector /= IncSize;
363 CurEntrySelector = CurEntrySelector % IncSize;
364 NextLevelTbl[CurEntrySelector] = Entry;
365 Entry.Key = reinterpret_cast<uint64_t>(NextLevelTbl) | FollowUpTableMarker;
366 return getEntry(NextLevelTbl, Key, Remainder, Alloc, CurLevel + 1);
367 }
368
369 MapEntry &getOrAllocEntry(uint64_t Key, BumpPtrAllocator &Alloc) {
370 if (TableRoot)
371 return getEntry(TableRoot, Key, Key, Alloc, 0);
372 return firstAllocation(Key, Alloc);
373 }
374};
375
376template <typename T> void resetIndCallCounter(T &Entry) {
377 Entry.Val = 0;
378}
379
380template <typename T, uint32_t X, uint32_t Y>
381void SimpleHashTable<T, X, Y>::resetCounters() {
382 forEachElement(resetIndCallCounter);
383}
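
A minimal sketch of how an instrumented call site drives this table (illustrative; GlobalAlloc is the allocator defined above):

    SimpleHashTable<> Targets;
    // Record one call to the target at 0x401000: the entry is created on
    // first use, and collisions are absorbed by chained IncSize tables.
    Targets.incrementVal(0x401000, GlobalAlloc);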
384
385/// Represents a hash table mapping a function target address to its counter.
386using IndirectCallHashTable = SimpleHashTable<>;
387
388/// Initialize with number 1 instead of 0 so we don't go into .bss. This is the
389/// global array of all hash tables storing indirect call destinations happening
390/// during runtime, one table per call site.
391IndirectCallHashTable *GlobalIndCallCounters{
392 reinterpret_cast<IndirectCallHashTable *>(1)};
393
394/// Don't allow reentrancy in the fdata writing phase - only one thread writes
395/// it
396Mutex *GlobalWriteProfileMutex{reinterpret_cast<Mutex *>(1)};
397
398/// Store number of calls in addition to target address (Key) and frequency
399/// as perceived by the basic block counter (Val).
400struct CallFlowEntryBase : public SimpleHashTableEntryBase {
401 uint64_t Calls;
402};
403
404using CallFlowHashTableBase = SimpleHashTable<CallFlowEntryBase, 11939, 233>;
405
406/// This is a large table indexing all possible call targets (indirect and
407/// direct ones). The goal is to find mismatches between number of calls (for
408/// those calls we were able to track) and the entry basic block counter of the
409/// callee. In most cases, these two should be equal. If not, there are two
410/// possible scenarios here:
411///
412/// * Entry BB has higher frequency than all known calls to this function.
413/// In this case, we have dynamic library code or any uninstrumented code
414/// calling this function. We will write the profile for these untracked
415/// calls as having source "0 [unknown] 0" in the fdata file.
416///
417/// * Number of known calls is higher than the frequency of entry BB
418/// This only happens when there is no counter for the entry BB / callee
419/// function is not simple (in BOLT terms). We don't do anything special
420/// here and just ignore those (we still report all calls to the non-simple
421/// function, though).
422///
423class CallFlowHashTable : public CallFlowHashTableBase {
424public:
425 CallFlowHashTable(BumpPtrAllocator &Alloc) : Alloc(Alloc) {}
426
427 MapEntry &get(uint64_t Key) { return CallFlowHashTableBase::get(Key, Alloc); }
428
429private:
430 // Unlike the hash table for indirect call targets, we do store the
431 // allocator here since there is only one call flow hash and space overhead
432 // is negligible.
433 BumpPtrAllocator &Alloc;
434};
435
436///
437/// Description metadata emitted by BOLT to describe the program - refer to
438/// Passes/Instrumentation.cpp - Instrumentation::emitTablesAsELFNote()
439///
440struct Location {
441 uint32_t FunctionName;
442 uint32_t Offset;
443};
444
445struct CallDescription {
446 Location From;
447 uint32_t FromNode;
448 Location To;
449 uint32_t Counter;
450 uint64_t TargetAddress;
451};
452
453using IndCallDescription = Location;
454
455struct IndCallTargetDescription {
456 Location Loc;
457 uint64_t Address;
458};
459
460struct EdgeDescription {
461 Location From;
462 uint32_t FromNode;
463 Location To;
464 uint32_t ToNode;
465 uint32_t Counter;
466};
467
468struct InstrumentedNode {
469 uint32_t Node;
470 uint32_t Counter;
471};
472
473struct EntryNode {
474 uint64_t Node;
475 uint64_t Address;
476};
477
478struct FunctionDescription {
479 uint32_t NumLeafNodes;
480 const InstrumentedNode *LeafNodes;
481 uint32_t NumEdges;
482 const EdgeDescription *Edges;
483 uint32_t NumCalls;
484 const CallDescription *Calls;
485 uint32_t NumEntryNodes;
486 const EntryNode *EntryNodes;
487
488 /// Constructor will parse the serialized function metadata written by BOLT
489 FunctionDescription(const uint8_t *FuncDesc);
490
491 uint64_t getSize() const {
492 return 16 + NumLeafNodes * sizeof(InstrumentedNode) +
493 NumEdges * sizeof(EdgeDescription) +
494 NumCalls * sizeof(CallDescription) +
495 NumEntryNodes * sizeof(EntryNode);
496 }
497};
498
499/// The context is created when the fdata profile needs to be written to disk
500/// and we need to interpret our runtime counters. It contains pointers to the
501/// mmaped binary (only the BOLT written metadata section). Deserialization
502/// should be straightforward as most data is POD or an array of POD elements.
503/// This metadata is used to reconstruct function CFGs.
504struct ProfileWriterContext {
505 IndCallDescription *IndCallDescriptions;
506 IndCallTargetDescription *IndCallTargets;
507 uint8_t *FuncDescriptions;
508 char *Strings; // String table with function names used in this binary
509 int FileDesc; // File descriptor for the file on disk backing this
510 // information in memory via mmap
511 void *MMapPtr; // The mmap ptr
512 int MMapSize; // The mmap size
513
514 /// Hash table storing all possible call destinations to detect untracked
515 /// calls and correctly report them as [unknown] in output fdata.
516 CallFlowHashTable *CallFlowTable;
517
518 /// Lookup the sorted indirect call target vector to fetch function name and
519 /// offset for an arbitrary function pointer.
520 const IndCallTargetDescription *lookupIndCallTarget(uint64_t Target) const;
521};
522
523/// Performs a string comparison and returns zero if Str1 matches Str2. Compares
524/// at most Size characters.
525int compareStr(const char *Str1, const char *Str2, int Size) {
526 while (*Str1 == *Str2) {
527 if (*Str1 == '\0' || --Size == 0)
528 return 0;
529 ++Str1;
530 ++Str2;
531 }
532 return 1;
533}
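
Note the convention: zero means "match", as in memcmp. For example, compareStr("abc", "abcdef", 3) returns 0 (only 3 characters are compared), while compareStr("abc", "abd", 3) returns 1.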
534
535/// Output Location to the fdata file
536char *serializeLoc(const ProfileWriterContext &Ctx, char *OutBuf,
537 const Location Loc, uint32_t BufSize) {
538 // fdata location format: Type Name Offset
539 // Type 1 - regular symbol
540 OutBuf = strCopy(OutBuf, "1 ");
541 const char *Str = Ctx.Strings + Loc.FunctionName;
542 uint32_t Size = 25;
543 while (*Str) {
544 *OutBuf++ = *Str++;
545 if (++Size >= BufSize)
546 break;
547 }
548 assert(!*Str, "buffer overflow, function name too large");
549 *OutBuf++ = ' ';
550 OutBuf = intToStr(OutBuf, Loc.Offset, 16);
551 *OutBuf++ = ' ';
552 return OutBuf;
553}
554
555/// Read and deserialize a function description written by BOLT. \p FuncDesc
556/// points at the beginning of the function metadata structure in the file.
557/// See Instrumentation::emitTablesAsELFNote()
558FunctionDescription::FunctionDescription(const uint8_t *FuncDesc) {
559 NumLeafNodes = *reinterpret_cast<const uint32_t *>(FuncDesc);
560 DEBUG(reportNumber("NumLeafNodes = ", NumLeafNodes, 10)){};
561 LeafNodes = reinterpret_cast<const InstrumentedNode *>(FuncDesc + 4);
562
563 NumEdges = *reinterpret_cast<const uint32_t *>(
564 FuncDesc + 4 + NumLeafNodes * sizeof(InstrumentedNode));
565 DEBUG(reportNumber("NumEdges = ", NumEdges, 10));
566 Edges = reinterpret_cast<const EdgeDescription *>(
567 FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode));
568
569 NumCalls = *reinterpret_cast<const uint32_t *>(
570 FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode) +
571 NumEdges * sizeof(EdgeDescription));
572 DEBUG(reportNumber("NumCalls = ", NumCalls, 10));
573 Calls = reinterpret_cast<const CallDescription *>(
574 FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) +
575 NumEdges * sizeof(EdgeDescription));
576 NumEntryNodes = *reinterpret_cast<const uint32_t *>(
577 FuncDesc + 12 + NumLeafNodes * sizeof(InstrumentedNode) +
578 NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription));
579 DEBUG(reportNumber("NumEntryNodes = ", NumEntryNodes, 10));
580 EntryNodes = reinterpret_cast<const EntryNode *>(
581 FuncDesc + 16 + NumLeafNodes * sizeof(InstrumentedNode) +
582 NumEdges * sizeof(EdgeDescription) + NumCalls * sizeof(CallDescription));
583}
584
585/// Read and mmap descriptions written by BOLT from the executable's notes
586/// section
587#if defined(HAVE_ELF_H) and !defined(__APPLE__)
588
589void *__attribute__((noinline)) __get_pc() {
590 return __builtin_extract_return_addr(__builtin_return_address(0));
591}
592
593/// Parse a string with an address range into the hex pair <StartAddress, EndAddress>
594bool parseAddressRange(const char *Str, uint64_t &StartAddress,
595 uint64_t &EndAddress) {
596 if (!Str)
597 return false;
598 // Parsed string format: <hex1>-<hex2>
599 StartAddress = hexToLong(Str, '-');
600 while (*Str && *Str != '-')
601 ++Str;
602 if (!*Str)
603 return false;
604 ++Str; // swallow '-'
605 EndAddress = hexToLong(Str);
606 return true;
607}
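
For example, a hypothetical /proc/self/map_files entry named "400000-4b0000" yields StartAddress == 0x400000 and EndAddress == 0x4b0000, and the function returns true; a name without a '-' returns false.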
608
609/// Get the full path to the real binary by taking our current virtual address
610/// and searching for the matching address-range link in
611/// /proc/self/map_files
612static char *getBinaryPath() {
613 const uint32_t BufSize = 1024;
614 const uint32_t NameMax = 4096;
615 const char DirPath[] = "/proc/self/map_files/";
616 static char TargetPath[NameMax] = {};
617 char Buf[BufSize];
618
619 if (__bolt_instr_binpath[0] != '\0')
620 return __bolt_instr_binpath;
621
622 if (TargetPath[0] != '\0')
623 return TargetPath;
624
625 unsigned long CurAddr = (unsigned long)__get_pc();
626 uint64_t FDdir = __open(DirPath,
627 /*flags=*/0 /*O_RDONLY*/,
628 /*mode=*/0666);
629 assert(static_cast<int64_t>(FDdir) >= 0,
630 "failed to open /proc/self/map_files");
631
632 while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) {
633 assert(static_cast<int64_t>(Nread) != -1, "failed to get folder entries");
634
635 struct dirent *d;
636 for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) {
637 d = (struct dirent *)(Buf + Bpos);
638
639 uint64_t StartAddress, EndAddress;
640 if (!parseAddressRange(d->d_name, StartAddress, EndAddress))
641 continue;
642 if (CurAddr < StartAddress || CurAddr > EndAddress)
643 continue;
644 char FindBuf[NameMax];
645 char *C = strCopy(FindBuf, DirPath, NameMax);
646 C = strCopy(C, d->d_name, NameMax - (C - FindBuf));
647 *C = '\0';
648 uint32_t Ret = __readlink(FindBuf, TargetPath, sizeof(TargetPath));
649 assert(Ret != -1 && Ret != BufSize, "readlink error");
650 TargetPath[Ret] = '\0';
651 return TargetPath;
652 }
653 }
654 return nullptr;
655}
656
657ProfileWriterContext readDescriptions() {
658 ProfileWriterContext Result;
659 char *BinPath = getBinaryPath();
660 assert(BinPath && BinPath[0] != '\0', "failed to find binary path");
661
662 uint64_t FD = __open(BinPath,
663 /*flags=*/0 /*O_RDONLY*/,
664 /*mode=*/0666);
665 assert(static_cast<int64_t>(FD) >= 0, "failed to open binary path");
666
667 Result.FileDesc = FD;
668
669 // mmap our binary to memory
670 uint64_t Size = __lseek(FD, 0, 2 /*SEEK_END*/);
671 uint8_t *BinContents = reinterpret_cast<uint8_t *>(
672 __mmap(0, Size, 0x1 /* PROT_READ*/, 0x2 /* MAP_PRIVATE*/, FD, 0));
673 Result.MMapPtr = BinContents;
674 Result.MMapSize = Size;
675 Elf64_Ehdr *Hdr = reinterpret_cast<Elf64_Ehdr *>(BinContents);
676 Elf64_Shdr *Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff);
677 Elf64_Shdr *StringTblHeader = reinterpret_cast<Elf64_Shdr *>(
678 BinContents + Hdr->e_shoff + Hdr->e_shstrndx * Hdr->e_shentsize);
679
680 // Find .bolt.instr.tables with the data we need and set pointers to it
681 for (int I = 0; I < Hdr->e_shnum; ++I) {
682 char *SecName = reinterpret_cast<char *>(
683 BinContents + StringTblHeader->sh_offset + Shdr->sh_name);
684 if (compareStr(SecName, ".bolt.instr.tables", 64) != 0) {
685 Shdr = reinterpret_cast<Elf64_Shdr *>(BinContents + Hdr->e_shoff +
686 (I + 1) * Hdr->e_shentsize);
687 continue;
688 }
689 // Actual contents of the ELF note start after offset 20 decimal:
690 // Offset 0: Producer name size (4 bytes)
691 // Offset 4: Contents size (4 bytes)
692 // Offset 8: Note type (4 bytes)
693 // Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
694 // Offset 20: Contents
695 uint32_t IndCallDescSize =
696 *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 20);
697 uint32_t IndCallTargetDescSize = *reinterpret_cast<uint32_t *>(
698 BinContents + Shdr->sh_offset + 24 + IndCallDescSize);
699 uint32_t FuncDescSize =
700 *reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 28 +
701 IndCallDescSize + IndCallTargetDescSize);
702 Result.IndCallDescriptions = reinterpret_cast<IndCallDescription *>(
703 BinContents + Shdr->sh_offset + 24);
704 Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
705 BinContents + Shdr->sh_offset + 28 + IndCallDescSize);
706 Result.FuncDescriptions = BinContents + Shdr->sh_offset + 32 +
707 IndCallDescSize + IndCallTargetDescSize;
708 Result.Strings = reinterpret_cast<char *>(
709 BinContents + Shdr->sh_offset + 32 + IndCallDescSize +
710 IndCallTargetDescSize + FuncDescSize);
711 return Result;
712 }
713 const char ErrMsg[] =
714 "BOLT instrumentation runtime error: could not find section "
715 ".bolt.instr.tables\n";
716 reportError(ErrMsg, sizeof(ErrMsg));
717 return Result;
718}
719
720#else
721
722ProfileWriterContext readDescriptions() {
723 ProfileWriterContext Result;
724 uint8_t *Tables = _bolt_instr_tables_getter();
725 uint32_t IndCallDescSize = *reinterpret_cast<uint32_t *>(Tables);
726 uint32_t IndCallTargetDescSize =
727 *reinterpret_cast<uint32_t *>(Tables + 4 + IndCallDescSize);
728 uint32_t FuncDescSize = *reinterpret_cast<uint32_t *>(
729 Tables + 8 + IndCallDescSize + IndCallTargetDescSize);
730 Result.IndCallDescriptions =
731 reinterpret_cast<IndCallDescription *>(Tables + 4);
732 Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
733 Tables + 8 + IndCallDescSize);
734 Result.FuncDescriptions =
735 Tables + 12 + IndCallDescSize + IndCallTargetDescSize;
736 Result.Strings = reinterpret_cast<char *>(
737 Tables + 12 + IndCallDescSize + IndCallTargetDescSize + FuncDescSize);
738 return Result;
739}
740
741#endif
742
743#if !defined(__APPLE__)
744/// Debug helper that prints overall metadata numbers as a sanity check
745void printStats(const ProfileWriterContext &Ctx) {
746 char StatMsg[BufSize];
747 char *StatPtr = StatMsg;
748 StatPtr =
749 strCopy(StatPtr,
750 "\nBOLT INSTRUMENTATION RUNTIME STATISTICS\n\nIndCallDescSize: ");
751 StatPtr = intToStr(StatPtr,
752 Ctx.FuncDescriptions -
753 reinterpret_cast<uint8_t *>(Ctx.IndCallDescriptions),
754 10);
755 StatPtr = strCopy(StatPtr, "\nFuncDescSize: ");
756 StatPtr = intToStr(
757 StatPtr,
758 reinterpret_cast<uint8_t *>(Ctx.Strings) - Ctx.FuncDescriptions, 10);
759 StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_ind_calls: ");
760 StatPtr = intToStr(StatPtr, __bolt_instr_num_ind_calls, 10);
761 StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_funcs: ");
762 StatPtr = intToStr(StatPtr, __bolt_instr_num_funcs, 10);
763 StatPtr = strCopy(StatPtr, "\n");
764 __write(2, StatMsg, StatPtr - StatMsg);
765}
766#endif
767
768
769/// This is part of a simple CFG representation in memory, where we store
770/// a dynamically sized array of input and output edges per node, and store
771/// a dynamically sized array of nodes per graph. We also store the spanning
772/// tree edges for that CFG in a separate array of nodes in
773/// \p SpanningTreeNodes, while the regular nodes live in \p CFGNodes.
774struct Edge {
775 uint32_t Node; // Index in the nodes array of this edge's destination
776 uint32_t ID; // Edge index in an array comprising all edges of the graph
777};
778
779/// A regular graph node or a spanning tree node
780struct Node {
781 uint32_t NumInEdges{0}; // Input edge count used to size InEdge
782 uint32_t NumOutEdges{0}; // Output edge count used to size OutEdges
783 Edge *InEdges{nullptr}; // Created and managed by \p Graph
784 Edge *OutEdges{nullptr}; // ditto
785};
786
787/// Main class for CFG representation in memory. Manages object creation and
788/// destruction, populates an array of CFG nodes as well as corresponding
789/// spanning tree nodes.
790struct Graph {
791 uint32_t NumNodes;
792 Node *CFGNodes;
793 Node *SpanningTreeNodes;
794 uint64_t *EdgeFreqs;
795 uint64_t *CallFreqs;
796 BumpPtrAllocator &Alloc;
797 const FunctionDescription &D;
798
799 /// Reads a list of edges from function description \p D and builds
800 /// the graph from it. Allocates several internal dynamic structures that are
801/// later destroyed by ~Graph() and uses \p Alloc. D.LeafNodes contains all
802/// spanning tree leaf node descriptions (their counters). They are the seed
803 /// used to compute the rest of the missing edge counts in a bottom-up
804 /// traversal of the spanning tree.
805 Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
806 const uint64_t *Counters, ProfileWriterContext &Ctx);
807 ~Graph();
808 void dump() const;
809
810private:
811 void computeEdgeFrequencies(const uint64_t *Counters,
812 ProfileWriterContext &Ctx);
813 void dumpEdgeFreqs() const;
814};
815
816Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
817 const uint64_t *Counters, ProfileWriterContext &Ctx)
818 : Alloc(Alloc), D(D) {
819 DEBUG(reportNumber("G = 0x", (uint64_t)this, 16));
820 // First pass to determine number of nodes
821 int32_t MaxNodes = -1;
822 CallFreqs = nullptr;
823 EdgeFreqs = nullptr;
824 for (int I = 0; I < D.NumEdges; ++I) {
825 if (static_cast<int32_t>(D.Edges[I].FromNode) > MaxNodes)
826 MaxNodes = D.Edges[I].FromNode;
827 if (static_cast<int32_t>(D.Edges[I].ToNode) > MaxNodes)
828 MaxNodes = D.Edges[I].ToNode;
829 }
830
831 for (int I = 0; I < D.NumLeafNodes; ++I)
832 if (static_cast<int32_t>(D.LeafNodes[I].Node) > MaxNodes)
833 MaxNodes = D.LeafNodes[I].Node;
834
835 for (int I = 0; I < D.NumCalls; ++I)
836 if (static_cast<int32_t>(D.Calls[I].FromNode) > MaxNodes)
837 MaxNodes = D.Calls[I].FromNode;
838
839 // No nodes? Nothing to do
840 if (MaxNodes < 0) {
841 DEBUG(report("No nodes!\n"));
842 CFGNodes = nullptr;
843 SpanningTreeNodes = nullptr;
844 NumNodes = 0;
845 return;
846 }
847 ++MaxNodes;
848 DEBUG(reportNumber("NumNodes = ", MaxNodes, 10));
849 NumNodes = static_cast<uint32_t>(MaxNodes);
850
851 // Initial allocations
852 CFGNodes = new (Alloc) Node[MaxNodes];
853
854 DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16));
855 SpanningTreeNodes = new (Alloc) Node[MaxNodes];
856 DEBUG(reportNumber("G->SpanningTreeNodes = 0x",{}
857 (uint64_t)SpanningTreeNodes, 16)){};
858
859 // Figure out how much to allocate to each vector (in/out edge sets)
860 for (int I = 0; I < D.NumEdges; ++I) {
861 CFGNodes[D.Edges[I].FromNode].NumOutEdges++;
862 CFGNodes[D.Edges[I].ToNode].NumInEdges++;
863 if (D.Edges[I].Counter != 0xffffffff)
864 continue;
865
866 SpanningTreeNodes[D.Edges[I].FromNode].NumOutEdges++;
867 SpanningTreeNodes[D.Edges[I].ToNode].NumInEdges++;
868 }
869
870 // Allocate in/out edge sets
871 for (int I = 0; I < MaxNodes; ++I) {
872 if (CFGNodes[I].NumInEdges > 0)
873 CFGNodes[I].InEdges = new (Alloc) Edge[CFGNodes[I].NumInEdges];
874 if (CFGNodes[I].NumOutEdges > 0)
875 CFGNodes[I].OutEdges = new (Alloc) Edge[CFGNodes[I].NumOutEdges];
876 if (SpanningTreeNodes[I].NumInEdges > 0)
877 SpanningTreeNodes[I].InEdges =
878 new (Alloc) Edge[SpanningTreeNodes[I].NumInEdges];
879 if (SpanningTreeNodes[I].NumOutEdges > 0)
880 SpanningTreeNodes[I].OutEdges =
881 new (Alloc) Edge[SpanningTreeNodes[I].NumOutEdges];
882 CFGNodes[I].NumInEdges = 0;
883 CFGNodes[I].NumOutEdges = 0;
884 SpanningTreeNodes[I].NumInEdges = 0;
885 SpanningTreeNodes[I].NumOutEdges = 0;
886 }
887
888 // Fill in/out edge sets
889 for (int I = 0; I < D.NumEdges; ++I) {
890 const uint32_t Src = D.Edges[I].FromNode;
891 const uint32_t Dst = D.Edges[I].ToNode;
892 Edge *E = &CFGNodes[Src].OutEdges[CFGNodes[Src].NumOutEdges++];
893 E->Node = Dst;
894 E->ID = I;
895
896 E = &CFGNodes[Dst].InEdges[CFGNodes[Dst].NumInEdges++];
897 E->Node = Src;
898 E->ID = I;
899
900 if (D.Edges[I].Counter != 0xffffffff)
901 continue;
902
903 E = &SpanningTreeNodes[Src]
904 .OutEdges[SpanningTreeNodes[Src].NumOutEdges++];
905 E->Node = Dst;
906 E->ID = I;
907
908 E = &SpanningTreeNodes[Dst]
909 .InEdges[SpanningTreeNodes[Dst].NumInEdges++];
910 E->Node = Src;
911 E->ID = I;
912 }
913
914 computeEdgeFrequencies(Counters, Ctx);
915}
916
917Graph::~Graph() {
918 if (CallFreqs)
919 Alloc.deallocate(CallFreqs);
920 if (EdgeFreqs)
921 Alloc.deallocate(EdgeFreqs);
922 for (int I = NumNodes - 1; I >= 0; --I) {
923 if (SpanningTreeNodes[I].OutEdges)
924 Alloc.deallocate(SpanningTreeNodes[I].OutEdges);
925 if (SpanningTreeNodes[I].InEdges)
926 Alloc.deallocate(SpanningTreeNodes[I].InEdges);
927 if (CFGNodes[I].OutEdges)
928 Alloc.deallocate(CFGNodes[I].OutEdges);
929 if (CFGNodes[I].InEdges)
930 Alloc.deallocate(CFGNodes[I].InEdges);
931 }
932 if (SpanningTreeNodes)
933 Alloc.deallocate(SpanningTreeNodes);
934 if (CFGNodes)
935 Alloc.deallocate(CFGNodes);
936}
937
938void Graph::dump() const {
939 reportNumber("Dumping graph with number of nodes: ", NumNodes, 10);
940 report(" Full graph:\n");
941 for (int I = 0; I < NumNodes; ++I) {
942 const Node *N = &CFGNodes[I];
943 reportNumber(" Node #", I, 10);
944 reportNumber(" InEdges total ", N->NumInEdges, 10);
945 for (int J = 0; J < N->NumInEdges; ++J)
946 reportNumber(" ", N->InEdges[J].Node, 10);
947 reportNumber(" OutEdges total ", N->NumOutEdges, 10);
948 for (int J = 0; J < N->NumOutEdges; ++J)
949 reportNumber(" ", N->OutEdges[J].Node, 10);
950 report("\n");
951 }
952 report(" Spanning tree:\n");
953 for (int I = 0; I < NumNodes; ++I) {
954 const Node *N = &SpanningTreeNodes[I];
955 reportNumber(" Node #", I, 10);
956 reportNumber(" InEdges total ", N->NumInEdges, 10);
957 for (int J = 0; J < N->NumInEdges; ++J)
958 reportNumber(" ", N->InEdges[J].Node, 10);
959 reportNumber(" OutEdges total ", N->NumOutEdges, 10);
960 for (int J = 0; J < N->NumOutEdges; ++J)
961 reportNumber(" ", N->OutEdges[J].Node, 10);
962 report("\n");
963 }
964}
965
966void Graph::dumpEdgeFreqs() const {
967 reportNumber(
968 "Dumping edge frequencies for graph with num edges: ", D.NumEdges, 10);
969 for (int I = 0; I < D.NumEdges; ++I) {
970 reportNumber("* Src: ", D.Edges[I].FromNode, 10);
971 reportNumber(" Dst: ", D.Edges[I].ToNode, 10);
972 reportNumber(" Cnt: ", EdgeFreqs[I], 10);
973 }
974}
975
976/// Auxiliary map structure for fast lookups of which calls map to each node of
977/// the function CFG
978struct NodeToCallsMap {
979 struct MapEntry {
980 uint32_t NumCalls;
981 uint32_t *Calls;
982 };
983 MapEntry *Entries;
984 BumpPtrAllocator &Alloc;
985 const uint32_t NumNodes;
986
987 NodeToCallsMap(BumpPtrAllocator &Alloc, const FunctionDescription &D,
988 uint32_t NumNodes)
989 : Alloc(Alloc), NumNodes(NumNodes) {
990 Entries = new (Alloc, 0) MapEntry[NumNodes];
991 for (int I = 0; I < D.NumCalls; ++I) {
992 DEBUG(reportNumber("Registering call in node ", D.Calls[I].FromNode, 10));
993 ++Entries[D.Calls[I].FromNode].NumCalls;
994 }
995 for (int I = 0; I < NumNodes; ++I) {
996 Entries[I].Calls = Entries[I].NumCalls ? new (Alloc)
997 uint32_t[Entries[I].NumCalls]
998 : nullptr;
999 Entries[I].NumCalls = 0;
1000 }
1001 for (int I = 0; I < D.NumCalls; ++I) {
1002 MapEntry &Entry = Entries[D.Calls[I].FromNode];
1003 Entry.Calls[Entry.NumCalls++] = I;
1004 }
1005 }
1006
1007 /// Set the frequency of all calls in node \p NodeID to Freq. However, if
1008 /// the calls have their own counters and do not depend on the basic block
1009 /// counter, this means they have landing pads and throw exceptions. In this
1010 /// case, set their frequency with their counters and return the maximum
1011 /// value observed in such counters. This will be used as the new frequency
1012 /// at basic block entry. This is used to fix the CFG edge frequencies in the
1013 /// presence of exceptions.
1014 uint64_t visitAllCallsIn(uint32_t NodeID, uint64_t Freq, uint64_t *CallFreqs,
1015 const FunctionDescription &D,
1016 const uint64_t *Counters,
1017 ProfileWriterContext &Ctx) const {
1018 const MapEntry &Entry = Entries[NodeID];
1019 uint64_t MaxValue = 0ull;
1020 for (int I = 0, E = Entry.NumCalls; I != E; ++I) {
1021 const uint32_t CallID = Entry.Calls[I];
1022 DEBUG(reportNumber(" Setting freq for call ID: ", CallID, 10));
1023 const CallDescription &CallDesc = D.Calls[CallID];
1024 if (CallDesc.Counter == 0xffffffff) {
1025 CallFreqs[CallID] = Freq;
1026 DEBUG(reportNumber(" with : ", Freq, 10));
1027 } else {
1028 const uint64_t CounterVal = Counters[CallDesc.Counter];
1029 CallFreqs[CallID] = CounterVal;
1030 MaxValue = CounterVal > MaxValue ? CounterVal : MaxValue;
1031 DEBUG(reportNumber(" with (private counter) : ", CounterVal, 10));
1032 }
1033 DEBUG(reportNumber(" Address: 0x", CallDesc.TargetAddress, 16));
1034 if (CallFreqs[CallID] > 0)
1035 Ctx.CallFlowTable->get(CallDesc.TargetAddress).Calls +=
1036 CallFreqs[CallID];
1037 }
1038 return MaxValue;
1039 }
1040
1041 ~NodeToCallsMap() {
1042 for (int I = NumNodes - 1; I >= 0; --I)
1043 if (Entries[I].Calls)
1044 Alloc.deallocate(Entries[I].Calls);
1045 Alloc.deallocate(Entries);
1046 }
1047};
1048
1049/// Fill an array with the frequency of each edge in the function represented
1050/// by G, as well as another array for each call.
1051void Graph::computeEdgeFrequencies(const uint64_t *Counters,
1052 ProfileWriterContext &Ctx) {
1053 if (NumNodes == 0)
1054 return;
1055
1056 EdgeFreqs = D.NumEdges ? new (Alloc, 0) uint64_t [D.NumEdges] : nullptr;
1057 CallFreqs = D.NumCalls ? new (Alloc, 0) uint64_t [D.NumCalls] : nullptr;
1058
1059 // Setup a lookup for calls present in each node (BB)
1060 NodeToCallsMap *CallMap = new (Alloc) NodeToCallsMap(Alloc, D, NumNodes);
1061
1062 // Perform a bottom-up, stack-based traversal of the spanning tree in G. Edges in the
1063 // spanning tree don't have explicit counters. We must infer their value using
1064 // a linear combination of other counters (sum of counters of the outgoing
1065 // edges minus sum of counters of the incoming edges).
1066 uint32_t *Stack = new (Alloc) uint32_t [NumNodes];
1067 uint32_t StackTop = 0;
1068 enum Status : uint8_t { S_NEW = 0, S_VISITING, S_VISITED };
1069 Status *Visited = new (Alloc, 0) Status[NumNodes];
1070 uint64_t *LeafFrequency = new (Alloc, 0) uint64_t[NumNodes];
1071 uint64_t *EntryAddress = new (Alloc, 0) uint64_t[NumNodes];
1072
1073 // Setup a fast lookup for frequency of leaf nodes, which have special
1074 // basic block frequency instrumentation (they are not edge profiled).
1075 for (int I = 0; I < D.NumLeafNodes; ++I) {
1076 LeafFrequency[D.LeafNodes[I].Node] = Counters[D.LeafNodes[I].Counter];
1077 DEBUG({
1078 if (Counters[D.LeafNodes[I].Counter] > 0) {
1079 reportNumber("Leaf Node# ", D.LeafNodes[I].Node, 10);
1080 reportNumber(" Counter: ", Counters[D.LeafNodes[I].Counter], 10);
1081 }
1082 });
1083 }
1084 for (int I = 0; I < D.NumEntryNodes; ++I) {
1085 EntryAddress[D.EntryNodes[I].Node] = D.EntryNodes[I].Address;
1086 DEBUG({
1087 reportNumber("Entry Node# ", D.EntryNodes[I].Node, 10);
1088 reportNumber(" Address: ", D.EntryNodes[I].Address, 16);
1089 });
1090 }
1091 // Add all root nodes to the stack
1092 for (int I = 0; I < NumNodes; ++I)
1093 if (SpanningTreeNodes[I].NumInEdges == 0)
1094 Stack[StackTop++] = I;
1095
1096 // Empty stack?
1097 if (StackTop == 0) {
1098 DEBUG(report("Empty stack!\n"));
1099 Alloc.deallocate(EntryAddress);
1100 Alloc.deallocate(LeafFrequency);
1101 Alloc.deallocate(Visited);
1102 Alloc.deallocate(Stack);
1103 CallMap->~NodeToCallsMap();
1104 Alloc.deallocate(CallMap);
1105 if (CallFreqs)
1106 Alloc.deallocate(CallFreqs);
1107 if (EdgeFreqs)
1108 Alloc.deallocate(EdgeFreqs);
1109 EdgeFreqs = nullptr;
1110 CallFreqs = nullptr;
1111 return;
1112 }
1113 // Add all known edge counts; we will infer the rest
1114 for (int I = 0; I < D.NumEdges; ++I) {
1115 const uint32_t C = D.Edges[I].Counter;
1116 if (C == 0xffffffff) // inferred counter - we will compute its value
1117 continue;
1118 EdgeFreqs[I] = Counters[C];
1119 }
1120
1121 while (StackTop > 0) {
1122 const uint32_t Cur = Stack[--StackTop];
1123 DEBUG({
1124 if (Visited[Cur] == S_VISITING)
1125 report("(visiting) ");
1126 else
1127 report("(new) ");
1128 reportNumber("Cur: ", Cur, 10);
1129 });
1130
1131 // This shouldn't happen in a tree
1132 assert(Visited[Cur] != S_VISITED, "should not have visited nodes in stack");
1133 if (Visited[Cur] == S_NEW) {
1134 Visited[Cur] = S_VISITING;
1135 Stack[StackTop++] = Cur;
1136 assert(StackTop <= NumNodes, "stack grew too large");
1137 for (int I = 0, E = SpanningTreeNodes[Cur].NumOutEdges; I < E; ++I) {
1138 const uint32_t Succ = SpanningTreeNodes[Cur].OutEdges[I].Node;
1139 Stack[StackTop++] = Succ;
1140 assert(StackTop <= NumNodes, "stack grew too large");
1141 }
1142 continue;
1143 }
1144 Visited[Cur] = S_VISITED;
1145
1146 // Establish our node frequency based on outgoing edges, which should all be
1147 // resolved by now.
1148 int64_t CurNodeFreq = LeafFrequency[Cur];
1149 // Not a leaf?
1150 if (!CurNodeFreq) {
1151 for (int I = 0, E = CFGNodes[Cur].NumOutEdges; I != E; ++I) {
1152 const uint32_t SuccEdge = CFGNodes[Cur].OutEdges[I].ID;
1153 CurNodeFreq += EdgeFreqs[SuccEdge];
1154 }
1155 }
1156 if (CurNodeFreq < 0)
1157 CurNodeFreq = 0;
1158
1159 const uint64_t CallFreq = CallMap->visitAllCallsIn(
1160 Cur, CurNodeFreq > 0 ? CurNodeFreq : 0, CallFreqs, D, Counters, Ctx);
1161
1162 // Exception handling affected our output flow? Fix with calls info
1163 DEBUG({
1164 if (CallFreq > CurNodeFreq)
1165 report("Bumping node frequency with call info\n");
1166 });
1167 CurNodeFreq = CallFreq > CurNodeFreq ? CallFreq : CurNodeFreq;
1168
1169 if (CurNodeFreq > 0) {
1170 if (uint64_t Addr = EntryAddress[Cur]) {
1171 DEBUG(
1172 reportNumber(" Setting flow at entry point address 0x", Addr, 16));
1173 DEBUG(reportNumber(" with: ", CurNodeFreq, 10));
1174 Ctx.CallFlowTable->get(Addr).Val = CurNodeFreq;
1175 }
1176 }
1177
1178 // No parent? Reached a tree root, limit to call frequency updating.
1179 if (SpanningTreeNodes[Cur].NumInEdges == 0)
1180 continue;
1181
1182 assert(SpanningTreeNodes[Cur].NumInEdges == 1, "must have 1 parent");
1183 const uint32_t Parent = SpanningTreeNodes[Cur].InEdges[0].Node;
Value stored to 'Parent' during its initialization is never read
1184 const uint32_t ParentEdge = SpanningTreeNodes[Cur].InEdges[0].ID;
1185
1186 // Calculate parent edge freq.
1187 int64_t ParentEdgeFreq = CurNodeFreq;
1188 for (int I = 0, E = CFGNodes[Cur].NumInEdges; I != E; ++I) {
1189 const uint32_t PredEdge = CFGNodes[Cur].InEdges[I].ID;
1190 ParentEdgeFreq -= EdgeFreqs[PredEdge];
1191 }
1192
1193 // Sometimes the conservative CFG that BOLT builds will lead to incorrect
1194 // flow computation. For example, in a BB that transitively calls the exit
1195 // syscall, BOLT will add a fall-through successor even though it should not
1196 // have any successors. So this block execution will likely be wrong. We
1197 // tolerate this imperfection since this case should be quite infrequent.
1198 if (ParentEdgeFreq < 0) {
1199 DEBUG(dumpEdgeFreqs());
1200 DEBUG(report("WARNING: incorrect flow"));
1201 ParentEdgeFreq = 0;
1202 }
1203 DEBUG(reportNumber(" Setting freq for ParentEdge: ", ParentEdge, 10));
1204 DEBUG(reportNumber(" with ParentEdgeFreq: ", ParentEdgeFreq, 10));
1205 EdgeFreqs[ParentEdge] = ParentEdgeFreq;
1206 }
1207
1208 Alloc.deallocate(EntryAddress);
1209 Alloc.deallocate(LeafFrequency);
1210 Alloc.deallocate(Visited);
1211 Alloc.deallocate(Stack);
1212 CallMap->~NodeToCallsMap();
1213 Alloc.deallocate(CallMap);
1214 DEBUG(dumpEdgeFreqs());
1215}
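
The inference above is plain flow conservation. A worked example with invented numbers: if a node's outgoing CFG edges carry counts 5 and 3, its frequency is 8; if its instrumented incoming edges total 6, the single uninstrumented spanning-tree parent edge must carry 8 - 6 = 2, which is exactly what the ParentEdgeFreq loop computes.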
1216
1217/// Write to \p FD all of the edge profiles for function \p FuncDesc. Uses
1218/// \p Alloc to allocate helper dynamic structures used to compute profile for
1219/// edges that we do not explicitly instrument.
1220const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
1221 const uint8_t *FuncDesc,
1222 BumpPtrAllocator &Alloc) {
1223 const FunctionDescription F(FuncDesc);
1224 const uint8_t *next = FuncDesc + F.getSize();
1225
1226#if !defined(__APPLE__)
1227 uint64_t *bolt_instr_locations = __bolt_instr_locations;
1228#else
1229 uint64_t *bolt_instr_locations = _bolt_instr_locations_getter();
1230#endif
1231
1232 // Skip funcs we know are cold
1233#ifndef ENABLE_DEBUG
1234 uint64_t CountersFreq = 0;
1235 for (int I = 0; I < F.NumLeafNodes; ++I)
1236 CountersFreq += bolt_instr_locations[F.LeafNodes[I].Counter];
1237
1238 if (CountersFreq == 0) {
1239 for (int I = 0; I < F.NumEdges; ++I) {
1240 const uint32_t C = F.Edges[I].Counter;
1241 if (C == 0xffffffff)
1242 continue;
1243 CountersFreq += bolt_instr_locations[C];
1244 }
1245 if (CountersFreq == 0) {
1246 for (int I = 0; I < F.NumCalls; ++I) {
1247 const uint32_t C = F.Calls[I].Counter;
1248 if (C == 0xffffffff)
1249 continue;
1250 CountersFreq += bolt_instr_locations[C];
1251 }
1252 if (CountersFreq == 0)
1253 return next;
1254 }
1255 }
1256#endif
1257
1258 Graph *G = new (Alloc) Graph(Alloc, F, bolt_instr_locations, Ctx);
1259 DEBUG(G->dump());
1260
1261 if (!G->EdgeFreqs && !G->CallFreqs) {
1262 G->~Graph();
1263 Alloc.deallocate(G);
1264 return next;
1265 }
1266
1267 for (int I = 0; I < F.NumEdges; ++I) {
1268 const uint64_t Freq = G->EdgeFreqs[I];
1269 if (Freq == 0)
1270 continue;
1271 const EdgeDescription *Desc = &F.Edges[I];
1272 char LineBuf[BufSize];
1273 char *Ptr = LineBuf;
1274 Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
1275 Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
1276 Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 22);
1277 Ptr = intToStr(Ptr, Freq, 10);
1278 *Ptr++ = '\n';
1279 __write(FD, LineBuf, Ptr - LineBuf);
1280 }
1281
1282 for (int I = 0; I < F.NumCalls; ++I) {
1283 const uint64_t Freq = G->CallFreqs[I];
1284 if (Freq == 0)
1285 continue;
1286 char LineBuf[BufSize];
1287 char *Ptr = LineBuf;
1288 const CallDescription *Desc = &F.Calls[I];
1289 Ptr = serializeLoc(Ctx, Ptr, Desc->From, BufSize);
1290 Ptr = serializeLoc(Ctx, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
1291 Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1292 Ptr = intToStr(Ptr, Freq, 10);
1293 *Ptr++ = '\n';
1294 __write(FD, LineBuf, Ptr - LineBuf);
1295 }
1296
1297 G->~Graph();
1298 Alloc.deallocate(G);
1299 return next;
1300}
1301
1302#if !defined(__APPLE__)
1303const IndCallTargetDescription *
1304ProfileWriterContext::lookupIndCallTarget(uint64_t Target) const {
1305 uint32_t B = 0;
1306 uint32_t E = __bolt_instr_num_ind_targets;
1307 if (E == 0)
1308 return nullptr;
1309 do {
1310 uint32_t I = (E - B) / 2 + B;
1311 if (IndCallTargets[I].Address == Target)
1312 return &IndCallTargets[I];
1313 if (IndCallTargets[I].Address < Target)
1314 B = I + 1;
1315 else
1316 E = I;
1317 } while (B < E);
1318 return nullptr;
1319}
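
This is a standard binary search, so it relies on BOLT emitting the IndCallTargets array sorted by Address in ascending order; addresses that are not known targets return nullptr and are later reported with an [unknown] source.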
1320
1321/// Write a single indirect call <src, target> pair to the fdata file
1322void visitIndCallCounter(IndirectCallHashTable::MapEntry &Entry,
1323 int FD, int CallsiteID,
1324 ProfileWriterContext *Ctx) {
1325 if (Entry.Val == 0)
1326 return;
1327 DEBUG(reportNumber("Target func 0x", Entry.Key, 16));
1328 DEBUG(reportNumber("Target freq: ", Entry.Val, 10));
1329 const IndCallDescription *CallsiteDesc =
1330 &Ctx->IndCallDescriptions[CallsiteID];
1331 const IndCallTargetDescription *TargetDesc =
1332 Ctx->lookupIndCallTarget(Entry.Key);
1333 if (!TargetDesc) {
1334 DEBUG(report("Failed to lookup indirect call target\n"));
1335 char LineBuf[BufSize];
1336 char *Ptr = LineBuf;
1337 Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
1338 Ptr = strCopy(Ptr, "0 [unknown] 0 0 ", BufSize - (Ptr - LineBuf) - 40);
1339 Ptr = intToStr(Ptr, Entry.Val, 10);
1340 *Ptr++ = '\n';
1341 __write(FD, LineBuf, Ptr - LineBuf);
1342 return;
1343 }
1344 Ctx->CallFlowTable->get(TargetDesc->Address).Calls += Entry.Val;
1345 char LineBuf[BufSize];
1346 char *Ptr = LineBuf;
1347 Ptr = serializeLoc(*Ctx, Ptr, *CallsiteDesc, BufSize);
1348 Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
1349 Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1350 Ptr = intToStr(Ptr, Entry.Val, 10);
1351 *Ptr++ = '\n';
1352 __write(FD, LineBuf, Ptr - LineBuf);
1353}
1354
1355/// Write to \p FD all of the indirect call profiles.
1356void writeIndirectCallProfile(int FD, ProfileWriterContext &Ctx) {
1357 for (int I = 0; I < __bolt_instr_num_ind_calls; ++I) {
1358 DEBUG(reportNumber("IndCallsite #", I, 10));
1359 GlobalIndCallCounters[I].forEachElement(visitIndCallCounter, FD, I, &Ctx);
1360 }
1361}
1362
1363/// Check a single call flow for a callee versus all known callers. If there are
1364/// fewer callers than the callee expects, write the difference with source
1365/// [unknown] in the profile.
1366void visitCallFlowEntry(CallFlowHashTable::MapEntry &Entry, int FD,
1367 ProfileWriterContext *Ctx) {
1368 DEBUG(reportNumber("Call flow entry address: 0x", Entry.Key, 16));
1369 DEBUG(reportNumber("Calls: ", Entry.Calls, 10));
1370 DEBUG(reportNumber("Reported entry frequency: ", Entry.Val, 10));
1371 DEBUG({
1372 if (Entry.Calls > Entry.Val)
1373 report(" More calls than expected!\n");
1374 });
1375 if (Entry.Val <= Entry.Calls)
1376 return;
1377 DEBUG(reportNumber(
1378 " Balancing calls with traffic: ", Entry.Val - Entry.Calls, 10));
1379 const IndCallTargetDescription *TargetDesc =
1380 Ctx->lookupIndCallTarget(Entry.Key);
1381 if (!TargetDesc) {
1382 // There is probably something wrong with this callee and this should be
1383 // investigated, but I don't want to assert and lose all data collected.
1384 DEBUG(report("WARNING: failed to look up call target!\n"));
1385 return;
1386 }
1387 char LineBuf[BufSize];
1388 char *Ptr = LineBuf;
1389 Ptr = strCopy(Ptr, "0 [unknown] 0 ", BufSize);
1390 Ptr = serializeLoc(*Ctx, Ptr, TargetDesc->Loc, BufSize - (Ptr - LineBuf));
1391 Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
1392 Ptr = intToStr(Ptr, Entry.Val - Entry.Calls, 10);
1393 *Ptr++ = '\n';
1394 __write(FD, LineBuf, Ptr - LineBuf);
1395}
1396
1397/// Open fdata file for writing and return a valid file descriptor, aborting
1398/// program upon failure.
1399int openProfile() {
1400 // Build the profile name string by appending our PID
1401 char Buf[BufSize];
1402 char *Ptr = Buf;
1403 uint64_t PID = __getpid();
1404 Ptr = strCopy(Buf, __bolt_instr_filename, BufSize);
1405 if (__bolt_instr_use_pid) {
1406 Ptr = strCopy(Ptr, ".", BufSize - (Ptr - Buf + 1));
1407 Ptr = intToStr(Ptr, PID, 10);
1408 Ptr = strCopy(Ptr, ".fdata", BufSize - (Ptr - Buf + 1));
1409 }
1410 *Ptr++ = '\0';
1411 uint64_t FD = __open(Buf,
1412 /*flags=*/0x241 /*O_WRONLY|O_TRUNC|O_CREAT*/,
1413 /*mode=*/0666);
1414 if (static_cast<int64_t>(FD) < 0) {
1415 report("Error while trying to open profile file for writing: ");
1416 report(Buf);
1417 reportNumber("\nFailed with error number: 0x",
1418 0 - static_cast<int64_t>(FD), 16);
1419 __exit(1);
1420 }
1421 return FD;
1422}
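
For example (hypothetical values), with __bolt_instr_filename set to "prof", __bolt_instr_use_pid enabled, and PID 1234, the code above opens "prof.1234.fdata" for writing.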
1423
1424#endif
1425
1426} // anonymous namespace
1427
1428#if !defined(__APPLE__)
1429
1430/// Reset all counters in case you want to start profiling a new phase of your
1431/// program independently of prior phases.
1432/// The address of this function is printed by BOLT so it can be called from
1433/// an attached debugger at runtime. There is a useful one-liner for gdb:
1434///
1435/// gdb -p $(pgrep -xo PROCESSNAME) -ex 'p ((void(*)())0xdeadbeef)()' \
1436/// -ex 'set confirm off' -ex quit
1437///
1438/// Where 0xdeadbeef is this function address and PROCESSNAME your binary file
1439/// name.
1440extern "C" void __bolt_instr_clear_counters() {
1441 memset(reinterpret_cast<char *>(__bolt_instr_locations), 0,
1442 __bolt_num_counters * 8);
1443 for (int I = 0; I < __bolt_instr_num_ind_calls; ++I)
1444 GlobalIndCallCounters[I].resetCounters();
1445}
1446
1447/// This is the entry point for profile writing.
1448/// There are three ways of getting here:
1449///
1450/// * Program execution ended, finalization methods are running and BOLT
1451/// hooked into FINI from your binary dynamic section;
1452/// * You used the sleep timer option and during initialization we forked
1453/// a separate process that will call this function periodically;
1454/// * BOLT prints this function address so you can attach a debugger and
1455/// call this function directly to get your profile written to disk
1456/// on demand.
1457///
1458extern "C" void __attribute((force_align_arg_pointer))
1459__bolt_instr_data_dump() {
1460 // Already dumping
1461 if (!GlobalWriteProfileMutex->acquire())
1462 return;
1463
1464 BumpPtrAllocator HashAlloc;
1465 HashAlloc.setMaxSize(0x6400000);
1466 ProfileWriterContext Ctx = readDescriptions();
1467 Ctx.CallFlowTable = new (HashAlloc, 0) CallFlowHashTable(HashAlloc);
1468
1469 DEBUG(printStats(Ctx));
1470
1471 int FD = openProfile();
1472
1473 BumpPtrAllocator Alloc;
1474 const uint8_t *FuncDesc = Ctx.FuncDescriptions;
1475 for (int I = 0, E = __bolt_instr_num_funcs; I < E; ++I) {
1476 FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
1477 Alloc.clear();
1478 DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
1479 }
1480 assert(FuncDesc == (void *)Ctx.Strings,
1481 "FuncDesc ptr must be equal to stringtable");
1482
1483 writeIndirectCallProfile(FD, Ctx);
1484 Ctx.CallFlowTable->forEachElement(visitCallFlowEntry, FD, &Ctx);
1485
1486 __fsync(FD);
1487 __close(FD);
1488 __munmap(Ctx.MMapPtr, Ctx.MMapSize);
1489 __close(Ctx.FileDesc);
1490 HashAlloc.destroy();
1491 GlobalWriteProfileMutex->release();
1492 DEBUG(report("Finished writing profile.\n"));
1493}
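// As the comment above notes, BOLT prints this function's address, so an
// attached debugger can trigger a dump on demand. The same one-liner shown
// for __bolt_instr_clear_counters applies, with 0xdeadbeef standing for the
// printed address of __bolt_instr_data_dump:
//
//   gdb -p $(pgrep -xo PROCESSNAME) -ex 'p ((void(*)())0xdeadbeef)()' \
//       -ex 'set confirm off' -ex quit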
1494
1495/// Event loop for our child process spawned during setup to dump profile data
1496/// at user-specified intervals
1497void watchProcess() {
1498 timespec ts, rem;
1499 uint64_t Elapsed = 0ull;
1500 uint64_t ppid;
1501 if (__bolt_instr_wait_forks) {
1502 // Store negated parent pgid so the liveness probe below covers the group
1503 ppid = -__getpgid(0);
1504 // And leave parent process group
1505 __setpgid(0, 0);
1506 } else {
1507 // Store parent pid
1508 ppid = __getppid();
1509 if (ppid == 1) {
1510 // Parent already dead
1511 __bolt_instr_data_dump();
1512 goto out;
1513 }
1514 }
1515
1516 ts.tv_sec = 1;
1517 ts.tv_nsec = 0;
1518 while (1) {
1519 __nanosleep(&ts, &rem);
1520 // Our parent process, or every process in its group when waiting on
1521 // forks, is dead, so there is no need for us to keep dumping.
1522 if (__kill(ppid, 0) < 0) {
1523 if (__bolt_instr_no_counters_clear)
1524 __bolt_instr_data_dump();
1525 break;
1526 }
1527
1528 if (++Elapsed < __bolt_instr_sleep_time)
1529 continue;
1530
1531 Elapsed = 0;
1532 __bolt_instr_data_dump();
1533 if (!__bolt_instr_no_counters_clear)
1534 __bolt_instr_clear_counters();
1535 }
1536
1537out:;
1538 DEBUG(report("My parent process is dead, bye!\n"));
1539 __exit(0);
1540}
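// A standalone POSIX sketch of the same watchdog pattern, for illustration
// (assumptions: hosted libc wrappers instead of the raw-syscall shims used
// above, a hypothetical watchParent helper, and the single-parent case only,
// without the process-group handling):
//
//   #include <signal.h>  // kill
//   #include <time.h>    // nanosleep, timespec
//   #include <unistd.h>  // fork, getppid, _exit
//
//   void watchParent(void (*Dump)()) {
//     pid_t Parent = getppid();
//     timespec Ts = {1, 0}; // poll once per second
//     // kill(pid, 0) delivers no signal; it only probes for liveness.
//     while (kill(Parent, 0) == 0)
//       nanosleep(&Ts, nullptr);
//     Dump(); // parent is gone; write the profile one last time
//     _exit(0);
//   }
//
//   // Usage: if (fork() == 0) watchParent(&__bolt_instr_data_dump);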
1541
1542extern "C" void __bolt_instr_indirect_call();
1543extern "C" void __bolt_instr_indirect_tailcall();
1544
1545/// Initialization code
1546extern "C" void __attribute((force_align_arg_pointer)) __bolt_instr_setup() {
1547 const uint64_t CountersStart =
1548 reinterpret_cast<uint64_t>(&__bolt_instr_locations[0]);
1549 const uint64_t CountersEnd = alignTo(
1550 reinterpret_cast<uint64_t>(&__bolt_instr_locations[__bolt_num_counters]),
1551 0x1000);
1552 DEBUG(reportNumber("replace mmap start: ", CountersStart, 16));
1553 DEBUG(reportNumber("replace mmap stop: ", CountersEnd, 16));
1554 assert(CountersEnd > CountersStart, "no counters");
1555 // Map our counters as shared instead of private so that forked processes
1556 // keep counting into the same memory
1557 __mmap(CountersStart, CountersEnd - CountersStart,
1558 0x3 /*PROT_READ|PROT_WRITE*/,
1559 0x31 /*MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED*/, -1, 0);
1560
1561 __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call;
1562 __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall;
1563 // Conservatively reserve 100 MiB (0x6400000 bytes) of shared pages
1564 GlobalAlloc.setMaxSize(0x6400000);
1565 GlobalAlloc.setShared(true);
1566 GlobalWriteProfileMutex = new (GlobalAlloc, 0) Mutex();
1567 if (__bolt_instr_num_ind_calls > 0)
1568 GlobalIndCallCounters =
1569 new (GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls];
1570
1571 if (__bolt_instr_sleep_time != 0) {
1572 // Move the instrumented process into its own process group
1573 if (__bolt_instr_wait_forks)
1574 __setpgid(0, 0);
1575
1576 if (long PID = __fork())
1577 return;
1578 watchProcess();
1579 }
1580}
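// A minimal Linux sketch of the remapping trick used in __bolt_instr_setup,
// for illustration (shareRegion is a hypothetical helper; the runtime above
// issues the raw syscall with hard-coded flag values instead): remapping an
// existing page-aligned data region as shared anonymous memory zeroes it and
// makes later fork() children update the same physical pages.
//
//   #include <sys/mman.h>
//
//   bool shareRegion(void *Start, size_t Len) {
//     void *P = mmap(Start, Len, PROT_READ | PROT_WRITE,
//                    MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
//     return P == Start; // MAP_FIXED returns Start on success
//   }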
1581
1582extern "C" __attribute((force_align_arg_pointer)) void
1583instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) {
1584 GlobalIndCallCounters[IndCallID].incrementVal(Target, GlobalAlloc);
1585}
1586
1587/// We receive as in-stack arguments the identifier of the indirect call site
1588/// as well as the target address for the call
1589extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
1590{
1591 __asm__ __volatile__(SAVE_ALL
1592 "mov 0xa0(%%rsp), %%rdi\n"
1593 "mov 0x98(%%rsp), %%rsi\n"
1594 "call instrumentIndirectCall\n"
1595 RESTORE_ALL
1596 "ret\n"
1597 :::);
1598}
1599
1600extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
1601{
1602 __asm__ __volatile__(SAVE_ALL
1603 "mov 0x98(%%rsp), %%rdi\n"
1604 "mov 0x90(%%rsp), %%rsi\n"
1605 "call instrumentIndirectCall\n"
1606 RESTORE_ALL
1607 "ret\n"
1608 :::);
1609}
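// Offset accounting for the two thunks above (illustrative; the exact slot
// layout is defined by the instrumentation pass, not by this file): SAVE_ALL
// plus the alignment adjustment move %rsp down by 0x80 bytes (15 eight-byte
// pushes plus 8). The 0x98/0xa0 displacements therefore reach past the saved
// state into words that were already on the stack on entry -- the call-site
// identifier and target address. The tail-call variant reads 8 bytes lower
// (0x90/0x98), matching one fewer word on its stack at entry.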
1610
1611/// This hooks the ELF entry point; it needs to save all machine state.
1612extern "C" __attribute((naked)) void __bolt_instr_start()
1613{
1614 __asm__ __volatile__(SAVE_ALL
1615 "call __bolt_instr_setup\n"
1616 RESTORE_ALL
1617 "jmp __bolt_start_trampoline\n"
1618 :::);
1619}
1620
1621/// This hooks into ELF's DT_FINI.
1622extern "C" void __bolt_instr_fini() {
1623 __bolt_fini_trampoline();
1624 if (__bolt_instr_sleep_time == 0)
1625 __bolt_instr_data_dump();
1626 DEBUG(report("Finished.\n"));
1627}
1628
1629#endif
1630
1631#if defined(__APPLE__)
1632
1633extern "C" void __bolt_instr_data_dump() {
1634 ProfileWriterContext Ctx = readDescriptions();
1635
1636 int FD = 2; // write the profile to stderr
1637 BumpPtrAllocator Alloc;
1638 const uint8_t *FuncDesc = Ctx.FuncDescriptions;
1639 uint32_t bolt_instr_num_funcs = _bolt_instr_num_funcs_getter();
1640
1641 for (int I = 0, E = bolt_instr_num_funcs; I < E; ++I) {
1642 FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
1643 Alloc.clear();
1644 DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
1645 }
1646 assert(FuncDesc == (void *)Ctx.Strings,
1647 "FuncDesc ptr must be equal to stringtable");
1648}
1649
1650// On OSX/iOS the final symbol name of an extern "C" function/variable contains
1651// one extra leading underscore: _bolt_instr_setup -> __bolt_instr_setup.
1652extern "C"
1653__attribute__((section("__TEXT,__setup")))
1654__attribute__((force_align_arg_pointer))
1655void _bolt_instr_setup() {
1656 __asm__ __volatile__(SAVE_ALL :::);
1657
1658 report("Hello!\n");
1659
1660 __asm__ __volatile__(RESTORE_ALL :::);
1661}
1662
1663extern "C"
1664__attribute__((section("__TEXT,__fini")))
1665__attribute__((force_align_arg_pointer))
1666void _bolt_instr_fini() {
1667 report("Bye!\n");
1668 __bolt_instr_data_dump();
1669}
1670
1671#endif
1672#endif