//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen. This is completely
/// different from the common SplitModule pass, as this system is designed with
/// AMDGPU in mind.
///
/// The basic idea of this module splitting implementation is the same as
/// SplitModule: load-balance the module's functions across a set of N
/// partitions to allow parallel codegen. However, it does it very
/// differently than the target-agnostic variant:
///   - Kernels are used as the module's "roots".
///     They're known entry points on AMDGPU, and everything else is often
///     internal only.
///   - Each kernel has a set of dependencies, and when a kernel and its
///     dependencies are considered "big", we try to put it in a partition where
///     most dependencies are already imported, to avoid duplicating large
///     amounts of code.
///   - There's special care for indirect calls in order to ensure
///     AMDGPUResourceUsageAnalysis can work correctly.
///
/// This file also includes a more elaborate logging system to enable
/// users to easily generate logs that (if desired) do not include any value
/// names, in order to not leak information about the source file.
/// Such logs are very helpful to understand and fix potential issues with
/// module splitting.
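///
/// The splitting itself is exposed through splitAMDGPUModule (declared in
/// AMDGPUSplitModule.h), which hands each partition back to the caller through
/// a callback. A minimal usage sketch, assuming an already-constructed
/// AMDGPUTargetMachine TM, a Module M, and a purely hypothetical consumer that
/// just collects the parts:
///
/// \code
///   SmallVector<std::unique_ptr<Module>, 4> Parts;
///   splitAMDGPUModule(TM, M, /*N=*/4, [&](std::unique_ptr<Module> MPart) {
///     // Invoked once per partition, N times in total.
///     Parts.push_back(std::move(MPart));
///   });
/// \endcode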

#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/SHA256.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-split-module"

namespace {

static cl::opt<float> LargeKernelFactor(
    "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f),
    cl::Hidden,
    cl::desc(
        "consider a kernel as large and needing special treatment when it "
        "exceeds the average cost of a partition by this factor; e.g. 2.0 "
        "means if the kernel and its dependencies are 2 times bigger than "
        "an average partition; 0 disables large kernels handling entirely"));

static cl::opt<float> LargeKernelOverlapForMerge(
    "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f),
    cl::Hidden,
    cl::desc("defines how much overlap between two large kernels' dependencies "
             "is needed to put them in the same partition"));

static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variables with local linkage; "
             "may cause globals to be duplicated which increases binary size"));

static cl::opt<std::string>
    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
              cl::desc("output directory for AMDGPU module splitting logs"));

static cl::opt<bool>
    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
               cl::desc("hash value names before printing them in the AMDGPU "
                        "module splitting logs"));

using CostType = InstructionCost::CostType;
using PartitionID = unsigned;

static bool isEntryPoint(const Function *F) {
  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
}

static std::string getName(const Value &V) {
  static bool HideNames;

  static llvm::once_flag HideNameInitFlag;
  llvm::call_once(HideNameInitFlag, [&]() {
    if (LogPrivate.getNumOccurrences())
      HideNames = LogPrivate;
    else {
      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
      HideNames = (EV.value_or("0") != "0");
    }
  });

  if (!HideNames)
    return V.getName().str();
  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
               /*LowerCase=*/true);
}

/// Main logging helper.
///
/// Logging can be configured by the following environment variables.
///   AMD_SPLIT_MODULE_LOG_DIR=<filepath>
///     If set, uses <filepath> as the directory to write logfiles to
///     each time module splitting is used.
///   AMD_SPLIT_MODULE_LOG_PRIVATE
///     If set to anything other than zero, all names are hidden.
///
/// Both environment variables have corresponding CL options which
/// take priority over them.
///
/// Any output printed to the log files is also printed to dbgs() when -debug is
/// used and LLVM_DEBUG is defined.
///
/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
/// cannot be removed from the code (by building without debug). This probably
/// has a small performance cost because if some computation/formatting is
/// needed for logging purposes, it may be done every time only to be ignored
/// by the logger.
///
/// As this pass only runs once and is not doing anything computationally
/// expensive, this is likely a reasonable trade-off.
///
/// If some computation should really be avoided when unused, users of the class
/// can check whether any logging will occur by using the bool operator.
///
/// \code
///   if (SML) {
///     // Executes only if logging to a file or if -debug is available and
///     // used.
///   }
/// \endcode
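///
/// For example, running with AMD_SPLIT_MODULE_LOG_DIR=/tmp/split-logs and
/// AMD_SPLIT_MODULE_LOG_PRIVATE=1 (hypothetical values) writes one log file per
/// split into /tmp/split-logs, with every value name replaced by its SHA256
/// hash.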
class SplitModuleLogger {
public:
  SplitModuleLogger(const Module &M) {
    std::string LogDir = LogDirOpt;
    if (LogDir.empty())
      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");

    // No log dir specified means we don't need to log to a file.
    // We may still log to dbgs(), though.
    if (LogDir.empty())
      return;

    // If a log directory is specified, create a new file with a unique name in
    // that directory.
    int Fd;
    SmallString<0> PathTemplate;
    SmallString<0> RealPath;
    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
    if (auto Err =
            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
                             "': " + Err.message(),
                         /*CrashDiag=*/false);
    }

    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
  }

  bool hasLogFile() const { return FileOS != nullptr; }

  raw_ostream &logfile() {
    assert(FileOS && "no logfile!");
    return *FileOS;
  }

  /// \returns true if this SML will log anything either to a file or dbgs().
  /// Can be used to avoid expensive computations that are ignored when logging
  /// is disabled.
  operator bool() const {
    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
  }

private:
  std::unique_ptr<raw_fd_ostream> FileOS;
};

template <typename Ty>
static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
  static_assert(
      !std::is_same_v<Ty, Value>,
      "do not print values to logs directly, use getName instead!");
  LLVM_DEBUG(dbgs() << Val);
  if (SML.hasLogFile())
    SML.logfile() << Val;
  return SML;
}

/// Calculate the cost of each function in \p M
/// \param SML Log Helper
/// \param TM TargetMachine instance used to retrieve TargetTransformInfo.
/// \param M Module to analyze.
/// \param CostMap[out] Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType
calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
                       Module &M,
                       DenseMap<const Function *, CostType> &CostMap) {
  CostType ModuleCost = 0;
  CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn);

    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal =
            Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (isEntryPoint(&Fn))
      KernelCost += FnCost;
  }

  CostType FnCost = (ModuleCost - KernelCost);
  SML << "=> Total Module Cost: " << ModuleCost << '\n'
      << "  => KernelCost: " << KernelCost << " ("
      << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "%)\n"
      << "  => FnsCost: " << FnCost << " ("
      << format("%0.2f", (float(FnCost) / ModuleCost) * 100) << "%)\n";

  return ModuleCost;
}

static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || isEntryPoint(&F))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false,
                           /*IgnoreAssumeLikeCalls=*/true,
                           /*IgnoreLLVMUsed=*/true,
                           /*IgnoreARCAttachedCall=*/false,
                           /*IgnoreCastedDirectCall=*/true);
}

/// When a kernel or any of its callees performs an indirect call, this function
/// takes over \ref addAllDependencies and adds all potentially callable
/// functions to \p Fns so they can be counted as dependencies of the kernel.
///
/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
/// presence of an indirect call, the function's resource usage is the same as
/// the most expensive function in the module.
/// \param M The module.
/// \param Fns[out] Resulting list of functions.
static void addAllIndirectCallDependencies(const Module &M,
                                           DenseSet<const Function *> &Fns) {
  for (const auto &Fn : M) {
    if (canBeIndirectlyCalled(Fn))
      Fns.insert(&Fn);
  }
}

/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
/// callee until all reachable functions have been gathered.
///
/// \param SML Log Helper
/// \param CG Call graph for \p Fn's module.
/// \param Fn Current function to look at.
/// \param Fns[out] Resulting list of functions.
/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
/// point, either in \p Fn or in one of the functions it calls. When that
/// happens, we fall back to adding all callable functions inside \p Fn's module
/// to \p Fns.
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
                               const Function &Fn,
                               DenseSet<const Function *> &Fns,
                               bool &HadIndirectCall) {
  assert(!Fn.isDeclaration());

  const Module &M = *Fn.getParent();
  SmallVector<const Function *> WorkList({&Fn});
  while (!WorkList.empty()) {
    const auto &CurFn = *WorkList.pop_back_val();
    assert(!CurFn.isDeclaration());

    // Scan for an indirect call. If such a call is found, we have to
    // conservatively assume this can call all non-entrypoint functions in the
    // module.

    for (auto &CGEntry : *CG[&CurFn]) {
      auto *CGNode = CGEntry.second;
      auto *Callee = CGNode->getFunction();
      if (!Callee) {
        // Functions have an edge towards CallsExternalNode if they're external
        // declarations, or if they do an indirect call. As we only process
        // definitions here, we know this means the function has an indirect
        // call. We then have to conservatively assume this can call all
        // non-entrypoint functions in the module.
        if (CGNode != CG.getCallsExternalNode())
          continue; // this is another function-less node we don't care about.

        SML << "Indirect call detected in " << getName(CurFn)
            << " - treating all non-entrypoint functions as "
               "potential dependencies\n";

        // TODO: Print an ORE as well ?
        addAllIndirectCallDependencies(M, Fns);
        HadIndirectCall = true;
        continue;
      }

      if (Callee->isDeclaration())
        continue;

      auto [It, Inserted] = Fns.insert(Callee);
      if (Inserted)
        WorkList.push_back(Callee);
    }
  }
}

/// Contains information about a kernel and its dependencies.
struct KernelWithDependencies {
  KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
                         const DenseMap<const Function *, CostType> &FnCosts,
                         const Function *Fn)
      : Fn(Fn) {
    addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
    TotalCost = FnCosts.at(Fn);
    for (const auto *Dep : Dependencies) {
      TotalCost += FnCosts.at(Dep);

      // We cannot duplicate functions with external linkage, or functions that
      // may be overridden at runtime.
      HasNonDuplicatableDependecy |=
          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
    }
  }

  const Function *Fn = nullptr;
  DenseSet<const Function *> Dependencies;
  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
  bool HasIndirectCall = false;
  /// Whether any of \p Fn's dependencies cannot be duplicated.
  bool HasNonDuplicatableDependecy = false;

  CostType TotalCost = 0;

  /// \returns true if this kernel and its dependencies can be considered large
  /// according to \p Threshold.
  bool isLarge(CostType Threshold) const {
    return TotalCost > Threshold && !Dependencies.empty();
  }
};

/// Calculates how much overlap there is between \p A and \p B.
/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
/// and B have no shared elements. Kernels do not count in overlap calculation.
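///
/// Illustrative example (hypothetical helper functions, not from any real
/// module): with A = {f, g, h} and B = {g, h, k}, none of them kernels, the
/// union has 4 elements and the intersection has 2, so the overlap is
/// 2 / 4 = 0.5 (i.e. the Jaccard index of the two dependency sets).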
static float calculateOverlap(const DenseSet<const Function *> &A,
                              const DenseSet<const Function *> &B) {
  DenseSet<const Function *> Total;
  for (const auto *F : A) {
    if (!isEntryPoint(F))
      Total.insert(F);
  }

  if (Total.empty())
    return 0.0f;

  unsigned NumCommon = 0;
  for (const auto *F : B) {
    if (isEntryPoint(F))
      continue;

    auto [It, Inserted] = Total.insert(F);
    if (!Inserted)
      ++NumCommon;
  }

  return static_cast<float>(NumCommon) / Total.size();
}

/// Performs all of the partitioning work on \p M.
/// \param SML Log Helper
/// \param M Module to partition.
/// \param NumParts Number of partitions to create.
/// \param ModuleCost Total cost of all functions in \p M.
/// \param FnCosts Map of Function -> Cost
/// \param WorkList Kernels and their dependencies to process in order.
/// \returns The created partitions (a vector of size \p NumParts)
static std::vector<DenseSet<const Function *>>
doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
               CostType ModuleCost,
               const DenseMap<const Function *, CostType> &FnCosts,
               const SmallVector<KernelWithDependencies> &WorkList) {

  SML << "\n--Partitioning Starts--\n";

  // Calculate a "large kernel threshold". When more than one kernel's total
  // import cost exceeds this value, we will try to merge it with other,
  // similarly large kernels.
  //
  // e.g. let two kernels X and Y have an import cost of ~10% of the module; we
  // assign X to a partition as usual, but when we get to Y, we check if it's
  // worth also putting it in X's partition.
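  //
  // Purely illustrative numbers (made up, not from any real module): with a
  // ModuleCost of 10000, NumParts = 4 and the default LargeKernelFactor of
  // 2.0, the threshold below is (10000 / 4) * 2.0 = 5000, so any kernel whose
  // kernel + dependency cost exceeds 5000 is treated as large.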
  const CostType LargeKernelThreshold =
      LargeKernelFactor ? CostType(((ModuleCost / NumParts) * LargeKernelFactor))
                        : std::numeric_limits<CostType>::max();

  std::vector<DenseSet<const Function *>> Partitions;
  Partitions.resize(NumParts);

  // Assign a partition to each kernel, and try to keep the partitions more or
  // less balanced. We do that through a priority queue sorted in reverse, so we
  // can always look at the partition with the least content.
  //
  // There are some cases where we will be deliberately unbalanced though.
  //  - Large kernels: we try to merge with existing partitions to reduce code
  //    duplication.
  //  - Kernels with indirect or external calls always go in the first partition
  //    (P0).
  auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
                              const std::pair<PartitionID, CostType> &b) {
    // When two partitions have the same cost, assign to the one with the
    // biggest ID first. This allows us to put things in P0 last, because P0 may
    // have other stuff added later.
    if (a.second == b.second)
      return a.first < b.first;
    return a.second > b.second;
  };

  // We can't use priority_queue here because we need to be able to access any
  // element. This makes this a bit inefficient as we need to sort it again
  // every time we change it, but it's a very small array anyway (likely under
  // 64 partitions) so it's a cheap operation.
  std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
  for (unsigned I = 0; I < NumParts; ++I)
    BalancingQueue.push_back(std::make_pair(I, 0));

  // Helper function to handle assigning a kernel to a partition. This takes
  // care of updating the balancing queue.
  const auto AssignToPartition = [&](PartitionID PID,
                                     const KernelWithDependencies &KWD) {
    auto &FnsInPart = Partitions[PID];
    FnsInPart.insert(KWD.Fn);
    FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());

    SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n  -> ";
    if (!KWD.Dependencies.empty())
      SML << KWD.Dependencies.size() << " dependencies added\n";

    // Update the balancing queue. We scan backwards because in the common case
    // the partition is at the end.
    for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
      if (QueuePID == PID) {
        CostType NewCost = 0;
        for (auto *Fn : Partitions[PID])
          NewCost += FnCosts.at(Fn);

        SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
        if (Cost) {
          SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
              << "% increase)";
        }
        SML << '\n';

        Cost = NewCost;
      }
    }

    sort(BalancingQueue, ComparePartitions);
  };

  for (auto &CurKernel : WorkList) {
    // When a kernel has indirect calls, it must stay in the first partition
    // alongside every reachable non-entry function. This is a nightmare case
    // for splitting as it severely limits what we can do.
    if (CurKernel.HasIndirectCall) {
      SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn)
          << " defaulting to P0\n";
      AssignToPartition(0, CurKernel);
      continue;
    }

    // When a kernel has non-duplicatable dependencies, we have to keep it in
    // the first partition as well. This is a conservative approach; a
    // finer-grained approach could keep track of which dependencies are
    // non-duplicatable exactly and just make sure they're grouped together.
    if (CurKernel.HasNonDuplicatableDependecy) {
      SML << "Kernel with externally visible dependency "
          << getName(*CurKernel.Fn) << " defaulting to P0\n";
      AssignToPartition(0, CurKernel);
      continue;
    }

    // Be smart with large kernels to avoid duplicating their dependencies.
    if (CurKernel.isLarge(LargeKernelThreshold)) {
      assert(LargeKernelOverlapForMerge >= 0.0f &&
             LargeKernelOverlapForMerge <= 1.0f);
      SML << "Large Kernel: " << getName(*CurKernel.Fn)
          << " - looking for partition with at least "
          << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n";

      bool Assigned = false;
      for (const auto &[PID, Fns] : enumerate(Partitions)) {
        float Overlap = calculateOverlap(CurKernel.Dependencies, Fns);
        SML << "  => " << format("%0.2f", Overlap * 100) << "% overlap with P"
            << PID << '\n';
        if (Overlap > LargeKernelOverlapForMerge) {
          SML << "  selecting P" << PID << '\n';
          AssignToPartition(PID, CurKernel);
          Assigned = true;
          // Stop at the first sufficiently-overlapping partition so the kernel
          // is only ever assigned once.
          break;
        }
      }

      if (Assigned)
        continue;
    }

    // Normal "load-balancing", assign to partition with least pressure.
    auto [PID, CurCost] = BalancingQueue.back();
    AssignToPartition(PID, CurKernel);
  }

  // Work is mostly done now; verify the partitioning and add all functions we
  // may have missed (i.e. unreachable, or we don't understand how they're
  // reached) to P0.
  DenseSet<const Function *> AllFunctions;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
    CostType Cost = 0;
    for (auto *Fn : Part) {
      // External linkage functions should exclusively be in the first partition
      // at this stage. In theory, we should only ever see external linkage
      // functions here if they're kernels, or if they've been added due to a
      // kernel using indirect calls somewhere in its CallGraph.
      assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
      Cost += FnCosts.at(Fn);
    }
    SML << "P" << Idx << " has a total cost of " << Cost << " ("
        << format("%0.2f", (float(Cost) / ModuleCost) * 100)
        << "% of source module)\n";
    AllFunctions.insert(Part.begin(), Part.end());
  }

  // Add missed functions to P0. This will take care of adding things like
  // external functions with no callers in the module to P0. This should be
  // fairly rare as AMDGPU internalizes everything in most cases, so unused
  // internal functions would get removed.
  for (auto &Fn : M) {
    if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
      SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
      Partitions[0].insert(&Fn);
    }
  }

  SML << "--Partitioning Done--\n\n";

  return Partitions;
}

static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}
} // end anonymous namespace

void llvm::splitAMDGPUModule(
    const AMDGPUTargetMachine &TM, Module &M, unsigned N,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {

  SplitModuleLogger SML(M);

  CallGraph CG(M);

  // Externalize functions whose address is taken.
  //
  // This is needed because partitioning is purely based on calls, but sometimes
  // a kernel/function may just look at the address of another local function
  // and not do anything (no calls). After partitioning, that local function may
  // end up in a different module (so it's just a declaration in the module
  // where its address is taken), which emits an "undefined hidden symbol"
  // linker error.
  //
  // Additionally, it guides partitioning to not duplicate this function if it's
  // called directly at some point.
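  //
  // Purely illustrative scenario (not taken from a specific test): a kernel
  // that only stores the address of a local helper into a table never creates
  // a call-graph edge to that helper, so the helper may be assigned to another
  // partition; had it kept local linkage, the kernel's module would then
  // reference a hidden symbol it no longer defines.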
  for (auto &Fn : M) {
    if (Fn.hasAddressTaken()) {
      if (Fn.hasLocalLinkage()) {
        SML << "[externalize] " << Fn.getName()
            << " because its address is taken\n";
      }
      externalize(Fn);
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turn helps keep code size in check.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        SML << "[externalize] GV " << GV.getName() << '\n';
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  DenseMap<const Function *, CostType> FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts);

  // Gather every kernel into a WorkList, then sort it by descending total cost
  // of the kernel so the biggest kernels are seen first.
  SmallVector<KernelWithDependencies> WorkList;
  for (auto &Fn : M) {
    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
  }
  sort(WorkList, [&](auto &A, auto &B) {
    // Sort by total cost, and if the total cost is identical, sort
    // alphabetically.
    if (A.TotalCost == B.TotalCost)
      return A.Fn->getName() < B.Fn->getName();
    return A.TotalCost > B.TotalCost;
  });

  if (SML) {
    SML << "Worklist\n";
    for (const auto &KWD : WorkList) {
      SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
          << " indirect:" << KWD.HasIndirectCall
          << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy
          << ")\n";
      for (const auto *Dep : KWD.Dependencies)
        SML << "  [Dep] " << getName(*Dep) << '\n';
    }
  }

  // This performs all of the partitioning work.
  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
  assert(Partitions.size() == N);

  // If we didn't externalize GVs, then local GVs need to be conservatively
  // imported into every module (including their initializers), and then cleaned
  // up afterwards.
  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
    // We conservatively import private/internal GVs into every module and clean
    // them up afterwards.
    const auto *Var = dyn_cast<GlobalVariable>(GV);
    return Var && Var->hasLocalLinkage();
  };

  SML << "Creating " << N << " modules...\n";
  unsigned TotalFnImpls = 0;
  for (unsigned I = 0; I < N; ++I) {
    const auto &FnsInPart = Partitions[I];

    ValueToValueMapTy VMap;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(GV)) {
            // Check we don't import an external linkage function in any
            // partition other than P0.
#ifndef NDEBUG
            if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
              assert((I == 0) == FnsInPart.contains(Fn));
            }
#endif
            return FnsInPart.contains(Fn);
          }

          if (NeedsConservativeImport(GV))
            return true;

          // Everything else goes in the first partition.
          return I == 0;
        }));

    // Clean up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(MPart->globals())) {
      if (NeedsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    unsigned NumAllFns = 0, NumKernels = 0;
    for (auto &Cur : *MPart) {
      if (!Cur.isDeclaration()) {
        ++NumAllFns;
        if (isEntryPoint(&Cur))
          ++NumKernels;
      }
    }
    TotalFnImpls += NumAllFns;
    SML << "  - Module " << I << " with " << NumAllFns << " functions ("
        << NumKernels << " kernels)\n";
    ModuleCallback(std::move(MPart));
  }

  SML << TotalFnImpls << " function definitions across all modules ("
      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
      << "% of original module)\n";
}