LLVM 23.0.0git
AMDGPUTargetMachine.cpp
Go to the documentation of this file.
1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file contains both AMDGPU target machine and the CodeGen pass builder.
11/// The AMDGPU target machine contains all of the hardware specific information
12/// needed to emit code for SI+ GPUs in the legacy pass manager pipeline. The
13/// CodeGen pass builder handles the pass pipeline for new pass manager.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetMachine.h"
18#include "AMDGPU.h"
19#include "AMDGPUAliasAnalysis.h"
25#include "AMDGPUHazardLatency.h"
26#include "AMDGPUIGroupLP.h"
27#include "AMDGPUISelDAGToDAG.h"
29#include "AMDGPUMacroFusion.h"
36#include "AMDGPUSplitModule.h"
41#include "GCNDPPCombine.h"
43#include "GCNNSAReassign.h"
47#include "GCNSchedStrategy.h"
48#include "GCNVOPDUtils.h"
49#include "R600.h"
50#include "R600TargetMachine.h"
51#include "SIFixSGPRCopies.h"
52#include "SIFixVGPRCopies.h"
53#include "SIFoldOperands.h"
54#include "SIFormMemoryClauses.h"
56#include "SILowerControlFlow.h"
57#include "SILowerSGPRSpills.h"
58#include "SILowerWWMCopies.h"
60#include "SIMachineScheduler.h"
64#include "SIPeepholeSDWA.h"
65#include "SIPostRABundler.h"
68#include "SIWholeQuadMode.h"
89#include "llvm/CodeGen/Passes.h"
94#include "llvm/IR/IntrinsicsAMDGPU.h"
95#include "llvm/IR/PassManager.h"
104#include "llvm/Transforms/IPO.h"
129#include <optional>
130
131using namespace llvm;
132using namespace llvm::PatternMatch;
133
134namespace {
135//===----------------------------------------------------------------------===//
136// AMDGPU CodeGen Pass Builder interface.
137//===----------------------------------------------------------------------===//
138
139class AMDGPUCodeGenPassBuilder
140 : public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
141 using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;
142
143public:
144 AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
145 const CGPassBuilderOption &Opts,
146 PassInstrumentationCallbacks *PIC);
147
148 void addIRPasses(PassManagerWrapper &PMW) const;
149 void addCodeGenPrepare(PassManagerWrapper &PMW) const;
150 void addPreISel(PassManagerWrapper &PMW) const;
151 void addILPOpts(PassManagerWrapper &PMWM) const;
152 void addAsmPrinterBegin(PassManagerWrapper &PMW) const;
153 void addAsmPrinter(PassManagerWrapper &PMW) const;
154 void addAsmPrinterEnd(PassManagerWrapper &PMW) const;
155 Error addInstSelector(PassManagerWrapper &PMW) const;
156 void addPreRewrite(PassManagerWrapper &PMW) const;
157 void addMachineSSAOptimization(PassManagerWrapper &PMW) const;
158 void addPostRegAlloc(PassManagerWrapper &PMW) const;
159 void addPreEmitPass(PassManagerWrapper &PMWM) const;
160 void addPreEmitRegAlloc(PassManagerWrapper &PMW) const;
161 Error addRegAssignmentFast(PassManagerWrapper &PMW) const;
162 Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const;
163 void addPreRegAlloc(PassManagerWrapper &PMW) const;
164 Error addFastRegAlloc(PassManagerWrapper &PMW) const;
165 Error addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
166 void addPreSched2(PassManagerWrapper &PMW) const;
167 void addPostBBSections(PassManagerWrapper &PMW) const;
168
169private:
170 Error validateRegAllocOptions() const;
171
172public:
173 /// Check if a pass is enabled given \p Opt option. The option always
174 /// overrides defaults if explicitly used. Otherwise its default will be used
175 /// given that a pass shall work at an optimization \p Level minimum.
176 bool isPassEnabled(const cl::opt<bool> &Opt,
177 CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
178 void addEarlyCSEOrGVNPass(PassManagerWrapper &PMW) const;
179 void addStraightLineScalarOptimizationPasses(PassManagerWrapper &PMW) const;
180};
181
182class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
183public:
184 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
185 : RegisterRegAllocBase(N, D, C) {}
186};
187
188class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
189public:
190 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
191 : RegisterRegAllocBase(N, D, C) {}
192};
193
194class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
195public:
196 WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
197 : RegisterRegAllocBase(N, D, C) {}
198};
199
200static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
201 const MachineRegisterInfo &MRI,
202 const Register Reg) {
203 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
204 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
205}
206
207static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
208 const MachineRegisterInfo &MRI,
209 const Register Reg) {
210 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
211 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
212}
213
214static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
215 const MachineRegisterInfo &MRI,
216 const Register Reg) {
217 const SIMachineFunctionInfo *MFI =
219 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
220 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
222}
223
224/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
225static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
226
227/// A dummy default pass factory indicates whether the register allocator is
228/// overridden on the command line.
229static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
230static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
231static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
232
233static SGPRRegisterRegAlloc
234defaultSGPRRegAlloc("default",
235 "pick SGPR register allocator based on -O option",
237
238static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
240SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
241 cl::desc("Register allocator to use for SGPRs"));
242
243static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
245VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
246 cl::desc("Register allocator to use for VGPRs"));
247
248static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
250 WWMRegAlloc("wwm-regalloc", cl::Hidden,
252 cl::desc("Register allocator to use for WWM registers"));
253
254// New pass manager register allocator options for AMDGPU
256 "sgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
257 cl::desc("Register allocator for SGPRs (new pass manager)"));
258
260 "vgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
261 cl::desc("Register allocator for VGPRs (new pass manager)"));
262
264 "wwm-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default),
265 cl::desc("Register allocator for WWM registers (new pass manager)"));
266
267/// Check if the given RegAllocType is supported for AMDGPU NPM register
268/// allocation. Only Fast and Greedy are supported; Basic and PBQP are not.
269static Error checkRegAllocSupported(RegAllocType RAType, StringRef RegName) {
270 if (RAType == RegAllocType::Basic || RAType == RegAllocType::PBQP) {
272 Twine("unsupported register allocator '") +
273 (RAType == RegAllocType::Basic ? "basic" : "pbqp") + "' for " +
274 RegName + " registers",
276 }
277 return Error::success();
278}
279
280Error AMDGPUCodeGenPassBuilder::validateRegAllocOptions() const {
281 // 1. Generic --regalloc-npm is not supported for AMDGPU.
282 if (Opt.RegAlloc != RegAllocType::Unset) {
284 "-regalloc-npm not supported for amdgcn. Use -sgpr-regalloc-npm, "
285 "-vgpr-regalloc-npm, and -wwm-regalloc-npm",
287 }
288
289 // 2. Legacy PM regalloc options are not compatible with NPM.
290 if (SGPRRegAlloc.getNumOccurrences() > 0 ||
291 VGPRRegAlloc.getNumOccurrences() > 0 ||
292 WWMRegAlloc.getNumOccurrences() > 0) {
294 "-sgpr-regalloc, -vgpr-regalloc, and -wwm-regalloc are legacy PM "
295 "options. Use -sgpr-regalloc-npm, -vgpr-regalloc-npm, and "
296 "-wwm-regalloc-npm with the new pass manager",
298 }
299
300 // 3. Only Fast and Greedy allocators are supported for AMDGPU.
301 if (auto Err = checkRegAllocSupported(SGPRRegAllocNPM, "SGPR"))
302 return Err;
303 if (auto Err = checkRegAllocSupported(WWMRegAllocNPM, "WWM"))
304 return Err;
305 if (auto Err = checkRegAllocSupported(VGPRRegAllocNPM, "VGPR"))
306 return Err;
307
308 return Error::success();
309}
310
311static void initializeDefaultSGPRRegisterAllocatorOnce() {
312 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
313
314 if (!Ctor) {
315 Ctor = SGPRRegAlloc;
316 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
317 }
318}
319
320static void initializeDefaultVGPRRegisterAllocatorOnce() {
321 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
322
323 if (!Ctor) {
324 Ctor = VGPRRegAlloc;
325 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
326 }
327}
328
329static void initializeDefaultWWMRegisterAllocatorOnce() {
330 RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
331
332 if (!Ctor) {
333 Ctor = WWMRegAlloc;
334 WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
335 }
336}
337
338static FunctionPass *createBasicSGPRRegisterAllocator() {
339 return createBasicRegisterAllocator(onlyAllocateSGPRs);
340}
341
342static FunctionPass *createGreedySGPRRegisterAllocator() {
343 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
344}
345
346static FunctionPass *createFastSGPRRegisterAllocator() {
347 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
348}
349
350static FunctionPass *createBasicVGPRRegisterAllocator() {
351 return createBasicRegisterAllocator(onlyAllocateVGPRs);
352}
353
354static FunctionPass *createGreedyVGPRRegisterAllocator() {
355 return createGreedyRegisterAllocator(onlyAllocateVGPRs);
356}
357
358static FunctionPass *createFastVGPRRegisterAllocator() {
359 return createFastRegisterAllocator(onlyAllocateVGPRs, true);
360}
361
362static FunctionPass *createBasicWWMRegisterAllocator() {
363 return createBasicRegisterAllocator(onlyAllocateWWMRegs);
364}
365
366static FunctionPass *createGreedyWWMRegisterAllocator() {
367 return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
368}
369
370static FunctionPass *createFastWWMRegisterAllocator() {
371 return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
372}
373
374static SGPRRegisterRegAlloc basicRegAllocSGPR(
375 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
376static SGPRRegisterRegAlloc greedyRegAllocSGPR(
377 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
378
379static SGPRRegisterRegAlloc fastRegAllocSGPR(
380 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
381
382
383static VGPRRegisterRegAlloc basicRegAllocVGPR(
384 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
385static VGPRRegisterRegAlloc greedyRegAllocVGPR(
386 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
387
388static VGPRRegisterRegAlloc fastRegAllocVGPR(
389 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
390static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
391 "basic register allocator",
392 createBasicWWMRegisterAllocator);
393static WWMRegisterRegAlloc
394 greedyRegAllocWWMReg("greedy", "greedy register allocator",
395 createGreedyWWMRegisterAllocator);
396static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
397 createFastWWMRegisterAllocator);
398
400 return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
401 Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
402}
403} // anonymous namespace
404
405static cl::opt<bool>
407 cl::desc("Run early if-conversion"),
408 cl::init(false));
409
410static cl::opt<bool>
411OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
412 cl::desc("Run pre-RA exec mask optimizations"),
413 cl::init(true));
414
415static cl::opt<bool>
416 LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
417 cl::desc("Lower GPU ctor / dtors to globals on the device."),
418 cl::init(true), cl::Hidden);
419
420// Option to disable vectorizer for tests.
422 "amdgpu-load-store-vectorizer",
423 cl::desc("Enable load store vectorizer"),
424 cl::init(true),
425 cl::Hidden);
426
427// Option to control global loads scalarization
429 "amdgpu-scalarize-global-loads",
430 cl::desc("Enable global load scalarization"),
431 cl::init(true),
432 cl::Hidden);
433
434// Option to run internalize pass.
436 "amdgpu-internalize-symbols",
437 cl::desc("Enable elimination of non-kernel functions and unused globals"),
438 cl::init(false),
439 cl::Hidden);
440
441// Option to inline all early.
443 "amdgpu-early-inline-all",
444 cl::desc("Inline all functions early"),
445 cl::init(false),
446 cl::Hidden);
447
449 "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
450 cl::desc("Enable removal of functions when they"
451 "use features not supported by the target GPU"),
452 cl::init(true));
453
455 "amdgpu-sdwa-peephole",
456 cl::desc("Enable SDWA peepholer"),
457 cl::init(true));
458
460 "amdgpu-dpp-combine",
461 cl::desc("Enable DPP combiner"),
462 cl::init(true));
463
464// Enable address space based alias analysis
466 cl::desc("Enable AMDGPU Alias Analysis"),
467 cl::init(true));
468
469// Enable lib calls simplifications
471 "amdgpu-simplify-libcall",
472 cl::desc("Enable amdgpu library simplifications"),
473 cl::init(true),
474 cl::Hidden);
475
477 "amdgpu-ir-lower-kernel-arguments",
478 cl::desc("Lower kernel argument loads in IR pass"),
479 cl::init(true),
480 cl::Hidden);
481
483 "amdgpu-reassign-regs",
484 cl::desc("Enable register reassign optimizations on gfx10+"),
485 cl::init(true),
486 cl::Hidden);
487
489 "amdgpu-opt-vgpr-liverange",
490 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
491 cl::init(true), cl::Hidden);
492
494 "amdgpu-atomic-optimizer-strategy",
495 cl::desc("Select DPP or Iterative strategy for scan"),
498 clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
500 "Use Iterative approach for scan"),
501 clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
502
503// Enable Mode register optimization
505 "amdgpu-mode-register",
506 cl::desc("Enable mode register pass"),
507 cl::init(true),
508 cl::Hidden);
509
510// Enable GFX11+ s_delay_alu insertion
511static cl::opt<bool>
512 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
513 cl::desc("Enable s_delay_alu insertion"),
514 cl::init(true), cl::Hidden);
515
516// Enable GFX11+ VOPD
517static cl::opt<bool>
518 EnableVOPD("amdgpu-enable-vopd",
519 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
520 cl::init(true), cl::Hidden);
521
522// Option is used in lit tests to prevent deadcoding of patterns inspected.
523static cl::opt<bool>
524EnableDCEInRA("amdgpu-dce-in-ra",
525 cl::init(true), cl::Hidden,
526 cl::desc("Enable machine DCE inside regalloc"));
527
528static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
529 cl::desc("Adjust wave priority"),
530 cl::init(false), cl::Hidden);
531
533 "amdgpu-scalar-ir-passes",
534 cl::desc("Enable scalar IR passes"),
535 cl::init(true),
536 cl::Hidden);
537
539 "amdgpu-enable-lower-exec-sync",
540 cl::desc("Enable lowering of execution synchronization."), cl::init(true),
541 cl::Hidden);
542
543static cl::opt<bool>
544 EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
545 cl::desc("Enable lowering of lds to global memory pass "
546 "and asan instrument resulting IR."),
547 cl::init(true), cl::Hidden);
548
550 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
552 cl::Hidden);
553
555 "amdgpu-enable-pre-ra-optimizations",
556 cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
557 cl::Hidden);
558
560 "amdgpu-enable-promote-kernel-arguments",
561 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
562 cl::Hidden, cl::init(true));
563
565 "amdgpu-enable-image-intrinsic-optimizer",
566 cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
567 cl::Hidden);
568
569static cl::opt<bool>
570 EnableLoopPrefetch("amdgpu-loop-prefetch",
571 cl::desc("Enable loop data prefetch on AMDGPU"),
572 cl::Hidden, cl::init(false));
573
575 AMDGPUSchedStrategy("amdgpu-sched-strategy",
576 cl::desc("Select custom AMDGPU scheduling strategy."),
577 cl::Hidden, cl::init(""));
578
579// Scheduler selection is consulted both when creating the scheduler and from
580// overrideSchedPolicy(), so keep the attribute and global command line handling
581// in one helper.
583 Attribute SchedStrategyAttr = F.getFnAttribute("amdgpu-sched-strategy");
584 if (SchedStrategyAttr.isValid())
585 return SchedStrategyAttr.getValueAsString();
586
587 if (!AMDGPUSchedStrategy.empty())
588 return AMDGPUSchedStrategy;
589
590 return "";
591}
592
593static void
595 const GCNSubtarget &ST) {
596 if (ST.hasGFX1250Insts())
597 return;
598
599 F.getContext().diagnose(DiagnosticInfoUnsupported(
600 F, "'amdgpu-sched-strategy'='coexec' is only supported for gfx1250",
602}
603
604static bool useNoopPostScheduler(const Function &F) {
605 Attribute PostSchedStrategyAttr =
606 F.getFnAttribute("amdgpu-post-sched-strategy");
607 return PostSchedStrategyAttr.isValid() &&
608 PostSchedStrategyAttr.getValueAsString() == "nop";
609}
610
612 "amdgpu-enable-rewrite-partial-reg-uses",
613 cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
614 cl::Hidden);
615
617 "amdgpu-enable-hipstdpar",
618 cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
619 cl::Hidden);
620
621static cl::opt<bool>
622 EnableAMDGPUAttributor("amdgpu-attributor-enable",
623 cl::desc("Enable AMDGPUAttributorPass"),
624 cl::init(true), cl::Hidden);
625
627 "new-reg-bank-select",
628 cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
629 "regbankselect"),
630 cl::init(false), cl::Hidden);
631
633 "amdgpu-link-time-closed-world",
634 cl::desc("Whether has closed-world assumption at link time"),
635 cl::init(false), cl::Hidden);
636
638 "amdgpu-enable-uniform-intrinsic-combine",
639 cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
640 cl::init(true), cl::Hidden);
641
643 // Register the target
646
730}
731
732static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
733 return std::make_unique<AMDGPUTargetObjectFile>();
734}
735
739
740static ScheduleDAGInstrs *
742 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
743 ScheduleDAGMILive *DAG =
744 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
745 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
746 if (ST.shouldClusterStores())
747 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
749 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
750 DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
751 DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
752 DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
753 return DAG;
754}
755
756static ScheduleDAGInstrs *
758 ScheduleDAGMILive *DAG =
759 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
761 return DAG;
762}
763
764static ScheduleDAGInstrs *
766 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
768 C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
769 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
770 if (ST.shouldClusterStores())
771 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
772 DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
773 DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
774 DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
775 return DAG;
776}
777
778static ScheduleDAGInstrs *
780 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
781 auto *DAG = new GCNIterativeScheduler(
783 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
784 if (ST.shouldClusterStores())
785 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
787 return DAG;
788}
789
796
797static ScheduleDAGInstrs *
799 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
801 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
802 if (ST.shouldClusterStores())
803 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
804 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
806 return DAG;
807}
808
809static MachineSchedRegistry
810SISchedRegistry("si", "Run SI's custom scheduler",
812
815 "Run GCN scheduler to maximize occupancy",
817
819 GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
821
823 "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
825
827 "gcn-iterative-max-occupancy-experimental",
828 "Run GCN scheduler to maximize occupancy (experimental)",
830
832 "gcn-iterative-minreg",
833 "Run GCN iterative scheduler for minimal register usage (experimental)",
835
837 "gcn-iterative-ilp",
838 "Run GCN iterative scheduler for ILP scheduling (experimental)",
840
843 if (!GPU.empty())
844 return GPU;
845
846 // Need to default to a target with flat support for HSA.
847 if (TT.isAMDGCN())
848 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
849
850 return "r600";
851}
852
854 // The AMDGPU toolchain only supports generating shared objects, so we
855 // must always use PIC.
856 return Reloc::PIC_;
857}
858
860 StringRef CPU, StringRef FS,
861 const TargetOptions &Options,
862 std::optional<Reloc::Model> RM,
863 std::optional<CodeModel::Model> CM,
866 T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options,
868 OptLevel),
870 initAsmInfo();
871 if (TT.isAMDGCN()) {
872 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
874 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
876 }
877}
878
881
883
885 Attribute GPUAttr = F.getFnAttribute("target-cpu");
886 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
887}
888
890 Attribute FSAttr = F.getFnAttribute("target-features");
891
892 return FSAttr.isValid() ? FSAttr.getValueAsString()
894}
895
898 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
900 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
901 if (ST.shouldClusterStores())
902 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
903 return DAG;
904}
905
906/// Predicate for Internalize pass.
907static bool mustPreserveGV(const GlobalValue &GV) {
908 if (const Function *F = dyn_cast<Function>(&GV))
909 return F->isDeclaration() || F->getName().starts_with("__asan_") ||
910 F->getName().starts_with("__sanitizer_") ||
911 AMDGPU::isEntryFunctionCC(F->getCallingConv());
912
914 return !GV.use_empty();
915}
916
921
924 if (Params.empty())
926 Params.consume_front("strategy=");
927 auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
928 .Case("dpp", ScanOptions::DPP)
929 .Cases({"iterative", ""}, ScanOptions::Iterative)
930 .Case("none", ScanOptions::None)
931 .Default(std::nullopt);
932 if (Result)
933 return *Result;
934 return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
935}
936
940 while (!Params.empty()) {
941 StringRef ParamName;
942 std::tie(ParamName, Params) = Params.split(';');
943 if (ParamName == "closed-world") {
944 Result.IsClosedWorld = true;
945 } else {
947 formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
948 .str(),
950 }
951 }
952 return Result;
953}
954
956
957#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
959
960 PB.registerPipelineParsingCallback(
961 [this](StringRef Name, CGSCCPassManager &PM,
963 if (Name == "amdgpu-attributor-cgscc" && getTargetTriple().isAMDGCN()) {
965 *static_cast<GCNTargetMachine *>(this)));
966 return true;
967 }
968 return false;
969 });
970
971 PB.registerScalarOptimizerLateEPCallback(
972 [](FunctionPassManager &FPM, OptimizationLevel Level) {
973 if (Level == OptimizationLevel::O0)
974 return;
975
977 });
978
979 PB.registerVectorizerEndEPCallback(
980 [](FunctionPassManager &FPM, OptimizationLevel Level) {
981 if (Level == OptimizationLevel::O0)
982 return;
983
985 });
986
987 PB.registerPipelineEarlySimplificationEPCallback(
988 [this](ModulePassManager &PM, OptimizationLevel Level,
990 if (!isLTOPreLink(Phase) && getTargetTriple().isAMDGCN()) {
991 // When we are not using -fgpu-rdc, we can run accelerator code
992 // selection relatively early, but still after linking to prevent
993 // eager removal of potentially reachable symbols.
994 if (EnableHipStdPar) {
997 }
998
1000 }
1001
1002 if (Level == OptimizationLevel::O0)
1003 return;
1004
1005 // We don't want to run internalization at per-module stage.
1008 PM.addPass(GlobalDCEPass());
1009 }
1010
1013 });
1014
1015 PB.registerPeepholeEPCallback(
1016 [](FunctionPassManager &FPM, OptimizationLevel Level) {
1017 if (Level == OptimizationLevel::O0)
1018 return;
1019
1023
1026 });
1027
1028 PB.registerCGSCCOptimizerLateEPCallback(
1029 [this](CGSCCPassManager &PM, OptimizationLevel Level) {
1030 if (Level == OptimizationLevel::O0)
1031 return;
1032
1034
1035 // Add promote kernel arguments pass to the opt pipeline right before
1036 // infer address spaces which is needed to do actual address space
1037 // rewriting.
1038 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
1041
1042 // Add infer address spaces pass to the opt pipeline after inlining
1043 // but before SROA to increase SROA opportunities.
1045
1046 // This should run after inlining to have any chance of doing
1047 // anything, and before other cleanup optimizations.
1049
1050 if (Level != OptimizationLevel::O0) {
1051 // Promote alloca to vector before SROA and loop unroll. If we
1052 // manage to eliminate allocas before unroll we may choose to unroll
1053 // less.
1055 }
1056
1057 PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
1058 });
1059
1060 // FIXME: Why is AMDGPUAttributor not in CGSCC?
1061 PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
1062 OptimizationLevel Level,
1064 if (Level != OptimizationLevel::O0) {
1065 if (!isLTOPreLink(Phase)) {
1066 if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
1068 MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
1069 }
1070 }
1071 }
1072 });
1073
1074 PB.registerFullLinkTimeOptimizationLastEPCallback(
1075 [this](ModulePassManager &PM, OptimizationLevel Level) {
1076 // When we are using -fgpu-rdc, we can only run accelerator code
1077 // selection after linking to prevent, otherwise we end up removing
1078 // potentially reachable symbols that were exported as external in other
1079 // modules.
1080 if (EnableHipStdPar) {
1083 }
1084 // We want to support the -lto-partitions=N option as "best effort".
1085 // For that, we need to lower LDS earlier in the pipeline before the
1086 // module is partitioned for codegen.
1089 if (EnableSwLowerLDS)
1090 PM.addPass(AMDGPUSwLowerLDSPass(*this));
1093 if (Level != OptimizationLevel::O0) {
1094 // We only want to run this with O2 or higher since inliner and SROA
1095 // don't run in O1.
1096 if (Level != OptimizationLevel::O1) {
1097 PM.addPass(
1099 }
1100 // Do we really need internalization in LTO?
1101 if (InternalizeSymbols) {
1103 PM.addPass(GlobalDCEPass());
1104 }
1105 if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
1108 Opt.IsClosedWorld = true;
1111 }
1112 }
1113 if (!NoKernelInfoEndLTO) {
1115 FPM.addPass(KernelInfoPrinter(this));
1116 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
1117 }
1118 });
1119
1120 PB.registerRegClassFilterParsingCallback(
1121 [](StringRef FilterName) -> RegAllocFilterFunc {
1122 if (FilterName == "sgpr")
1123 return onlyAllocateSGPRs;
1124 if (FilterName == "vgpr")
1125 return onlyAllocateVGPRs;
1126 if (FilterName == "wwm")
1127 return onlyAllocateWWMRegs;
1128 return nullptr;
1129 });
1130}
1131
1133 unsigned DestAS) const {
1134 return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
1136}
1137
1139 if (auto *Arg = dyn_cast<Argument>(V);
1140 Arg &&
1141 AMDGPU::isModuleEntryFunctionCC(Arg->getParent()->getCallingConv()) &&
1142 !Arg->hasByRefAttr())
1144
1145 const auto *LD = dyn_cast<LoadInst>(V);
1146 if (!LD) // TODO: Handle invariant load like constant.
1148
1149 // It must be a generic pointer loaded.
1150 assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
1151
1152 const auto *Ptr = LD->getPointerOperand();
1153 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
1155 // For a generic pointer loaded from the constant memory, it could be assumed
1156 // as a global pointer since the constant memory is only populated on the
1157 // host side. As implied by the offload programming model, only global
1158 // pointers could be referenced on the host side.
1160}
1161
1162std::pair<const Value *, unsigned>
1164 if (auto *II = dyn_cast<IntrinsicInst>(V)) {
1165 switch (II->getIntrinsicID()) {
1166 case Intrinsic::amdgcn_is_shared:
1167 return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
1168 case Intrinsic::amdgcn_is_private:
1169 return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
1170 default:
1171 break;
1172 }
1173 return std::pair(nullptr, -1);
1174 }
1175 // Check the global pointer predication based on
1176 // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
1177 // the order of 'is_shared' and 'is_private' is not significant.
1178 Value *Ptr;
1179 if (match(
1180 const_cast<Value *>(V),
1183 m_Deferred(Ptr))))))
1184 return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
1185
1186 return std::pair(nullptr, -1);
1187}
1188
1189unsigned
1204
1206 Module &M, unsigned NumParts,
1207 function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
1208 // FIXME(?): Would be better to use an already existing Analysis/PassManager,
1209 // but all current users of this API don't have one ready and would need to
1210 // create one anyway. Let's hide the boilerplate for now to keep it simple.
1211
1216
1217 PassBuilder PB(this);
1218 PB.registerModuleAnalyses(MAM);
1219 PB.registerFunctionAnalyses(FAM);
1220 PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
1221
1223 MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
1224 MPM.run(M, MAM);
1225 return true;
1226}
1227
1228//===----------------------------------------------------------------------===//
1229// GCN Target Machine (SI+)
1230//===----------------------------------------------------------------------===//
1231
1233 StringRef CPU, StringRef FS,
1234 const TargetOptions &Options,
1235 std::optional<Reloc::Model> RM,
1236 std::optional<CodeModel::Model> CM,
1237 CodeGenOptLevel OL, bool JIT)
1238 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
1239
1240const TargetSubtargetInfo *
1242 StringRef GPU = getGPUName(F);
1244
1245 SmallString<128> SubtargetKey(GPU);
1246 SubtargetKey.append(FS);
1247
1248 auto &I = SubtargetMap[SubtargetKey];
1249 if (!I) {
1250 // This needs to be done before we create a new subtarget since any
1251 // creation will depend on the TM and the code generation flags on the
1252 // function that reside in TargetOptions.
1254 I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
1255 }
1256
1257 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
1258
1259 return I.get();
1260}
1261
1264 return TargetTransformInfo(std::make_unique<GCNTTIImpl>(this, F));
1265}
1266
1269 raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
1270 const CGPassBuilderOption &Opts, MCContext &Ctx,
1272 AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
1273 return CGPB.buildPipeline(MPM, MAM, Out, DwoOut, FileType, Ctx);
1274}
1275
1278 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1279 if (ST.enableSIScheduler())
1281
1282 StringRef SchedStrategy = AMDGPU::getSchedStrategy(C->MF->getFunction());
1283
1284 if (SchedStrategy == "max-ilp")
1286
1287 if (SchedStrategy == "max-memory-clause")
1289
1290 if (SchedStrategy == "iterative-ilp")
1292
1293 if (SchedStrategy == "iterative-minreg")
1294 return createMinRegScheduler(C);
1295
1296 if (SchedStrategy == "iterative-maxocc")
1298
1299 if (SchedStrategy == "coexec") {
1300 diagnoseUnsupportedCoExecSchedulerSelection(C->MF->getFunction(), ST);
1302 }
1303
1305}
1306
1309 if (useNoopPostScheduler(C->MF->getFunction()))
1311
1312 ScheduleDAGMI *DAG =
1313 new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
1314 /*RemoveKillFlags=*/true);
1315 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1317 if (ST.shouldClusterStores())
1320 if ((EnableVOPD.getNumOccurrences() ||
1322 EnableVOPD)
1327 return DAG;
1328}
1329//===----------------------------------------------------------------------===//
1330// AMDGPU Legacy Pass Setup
1331//===----------------------------------------------------------------------===//
1332
1333std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
1334 return getStandardCSEConfigForOpt(TM->getOptLevel());
1335}
1336
1337 namespace {
1338 
// Legacy (TargetPassConfig) pass pipeline configuration for GCN (SI+)
// subtargets. File-local, so it lives in an anonymous namespace.
1339 class GCNPassConfig final : public AMDGPUPassConfig {
1340 public:
1341 GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
1342 : AMDGPUPassConfig(TM, PM) {
// AMDGPU uses the MachineScheduler infrastructure for post-RA scheduling
// instead of the default PostRAScheduler.
1343 substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
1344 }
1345 
1346 GCNTargetMachine &getGCNTargetMachine() const {
1347 return getTM<GCNTargetMachine>();
1348 }
1349 
// Hooks overriding the generic TargetPassConfig pipeline stages.
1350 bool addPreISel() override;
1351 void addMachineSSAOptimization() override;
1352 bool addILPOpts() override;
1353 bool addInstSelector() override;
1354 bool addIRTranslator() override;
1355 void addPreLegalizeMachineIR() override;
1356 bool addLegalizeMachineIR() override;
1357 void addPreRegBankSelect() override;
1358 bool addRegBankSelect() override;
1359 void addPreGlobalInstructionSelect() override;
1360 bool addGlobalInstructionSelect() override;
1361 void addPreRegAlloc() override;
1362 void addFastRegAlloc() override;
1363 void addOptimizedRegAlloc() override;
1364 
// AMDGPU splits register allocation into separate SGPR, WWM and VGPR
// phases; the generic single-allocator hook is therefore unused.
1365 FunctionPass *createSGPRAllocPass(bool Optimized);
1366 FunctionPass *createVGPRAllocPass(bool Optimized);
1367 FunctionPass *createWWMRegAllocPass(bool Optimized);
1368 FunctionPass *createRegAllocPass(bool Optimized) override;
1369 
1370 bool addRegAssignAndRewriteFast() override;
1371 bool addRegAssignAndRewriteOptimized() override;
1372 
1373 bool addPreRewrite() override;
1374 void addPostRegAlloc() override;
1375 void addPreSched2() override;
1376 void addPreEmitPass() override;
1377 void addPostBBSections() override;
1378 };
1379 
1380 } // end anonymous namespace
1381
1383 : TargetPassConfig(TM, PM) {
1384 // Exceptions and StackMaps are not supported, so these passes will never do
1385 // anything.
1388 // Garbage collection is not supported.
1391}
1392
1399
1404 // ReassociateGEPs exposes more opportunities for SLSR. See
1405 // the example in reassociate-geps-and-slsr.ll.
1407 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
1408 // EarlyCSE can reuse.
1410 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1412 // NaryReassociate on GEPs creates redundant common expressions, so run
1413 // EarlyCSE after it.
1415}
1416
1419
1420 if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
1422
1423 // There is no reason to run these.
1427
1428 if (TM.getTargetTriple().isAMDGCN())
1430
1431 if (LowerCtorDtor)
1433
1434 if (TM.getTargetTriple().isAMDGCN() &&
1437
1440
1441 // This can be disabled by passing ::Disable here or on the command line
1442 // with --expand-variadics-override=disable.
1444
1445 // Function calls are not supported, so make sure we inline everything.
1448
1449 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1450 if (TM.getTargetTriple().getArch() == Triple::r600)
1452
1453 // Make enqueued block runtime handles externally visible.
1455
1456 // Lower special LDS accesses.
1459
1460 // Lower LDS accesses to global memory pass if address sanitizer is enabled.
1461 if (EnableSwLowerLDS)
1463
1464 // Runs before PromoteAlloca so the latter can account for function uses
1467 }
1468
1469 // Run atomic optimizer before Atomic Expand
1470 if ((TM.getTargetTriple().isAMDGCN()) &&
1471 (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
1474 }
1475
1477
1478 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1480
1483
1487 AAResults &AAR) {
1488 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1489 AAR.addAAResult(WrapperPass->getResult());
1490 }));
1491 }
1492
1493 if (TM.getTargetTriple().isAMDGCN()) {
1494 // TODO: May want to move later or split into an early and late one.
1496 }
1497
1498 // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
1499 // have expanded.
1500 if (TM.getOptLevel() > CodeGenOptLevel::Less)
1502 }
1503
1505
1506 // EarlyCSE is not always strong enough to clean up what LSR produces. For
1507 // example, GVN can combine
1508 //
1509 // %0 = add %a, %b
1510 // %1 = add %b, %a
1511 //
1512 // and
1513 //
1514 // %0 = shl nsw %a, 2
1515 // %1 = shl %a, 2
1516 //
1517 // but EarlyCSE can do neither of them.
1520}
1521
1523 if (TM->getTargetTriple().isAMDGCN() &&
1524 TM->getOptLevel() > CodeGenOptLevel::None)
1526
1527 if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
1529
1531
1534
1535 if (TM->getTargetTriple().isAMDGCN()) {
1536 // This lowering has been placed after codegenprepare to take advantage of
1537 // address mode matching (which is why it isn't put with the LDS lowerings).
1538 // It could be placed anywhere before uniformity annotations (an analysis
1539 // that it changes by splitting up fat pointers into their components)
1540 // but has been put before switch lowering and CFG flattening so that those
1541 // passes can run on the more optimized control flow this pass creates in
1542 // many cases.
1545 }
1546
1547 // LowerSwitch pass may introduce unreachable blocks that can
1548 // cause unexpected behavior for subsequent passes. Placing it
1549 // here seems better that these blocks would get cleaned up by
1550 // UnreachableBlockElim inserted next in the pass flow.
1552}
1553
1555 if (TM->getOptLevel() > CodeGenOptLevel::None)
1557 return false;
1558}
1559
1564
1566 // Do nothing. GC is not supported.
1567 return false;
1568}
1569
1570//===----------------------------------------------------------------------===//
1571// GCN Legacy Pass Setup
1572//===----------------------------------------------------------------------===//
1573
// Runs IR passes that prepare the CFG for instruction selection:
// sinking/late codegen-prepare at -O1+, then CFG structurization so divergent
// control flow can be lowered. NOTE(review): several addPass lines are elided
// in this rendered view of the source.
1574 bool GCNPassConfig::addPreISel() {
1576 
1577 if (TM->getOptLevel() > CodeGenOptLevel::None)
1578 addPass(createSinkingPass());
1579 
1580 if (TM->getOptLevel() > CodeGenOptLevel::None)
1582 
1583 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1584 // regions formed by them.
1586 addPass(createFixIrreduciblePass());
1587 addPass(createUnifyLoopExitsPass());
1588 addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1589 
1592 // TODO: Move this right after structurizeCFG to avoid extra divergence
1593 // analysis. This depends on stopping SIAnnotateControlFlow from making
1594 // control flow modifications.
1596 
1597 // SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
1598 // with -new-reg-bank-select and without any of the fallback options.
1600 !isGlobalISelAbortEnabled() || !NewRegBankSelect)
1601 addPass(createLCSSAPass());
1602 
1603 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1605 
// Returning false: no custom ISel pass list was installed here.
1606 return false;
1607 }
1608
// Machine-SSA optimization pipeline. Pass ORDER is significant here; the SDWA
// group re-runs LICM/CSE/fold because SDWA peepholes expose new operands.
1609 void GCNPassConfig::addMachineSSAOptimization() {
1611 
1612 // We want to fold operands after PeepholeOptimizer has run (or as part of
1613 // it), because it will eliminate extra copies making it easier to fold the
1614 // real source operand. We want to eliminate dead instructions after, so that
1615 // we see fewer uses of the copies. We then need to clean up the dead
1616 // instructions leftover after the operands are folded as well.
1617 //
1618 // XXX - Can we get away without running DeadMachineInstructionElim again?
1619 addPass(&SIFoldOperandsLegacyID);
1620 if (EnableDPPCombine)
1621 addPass(&GCNDPPCombineLegacyID);
1623 if (isPassEnabled(EnableSDWAPeephole)) {
1624 addPass(&SIPeepholeSDWALegacyID);
1625 addPass(&EarlyMachineLICMID);
1626 addPass(&MachineCSELegacyID);
// Fold again: SDWA/CSE can make additional operands foldable.
1627 addPass(&SIFoldOperandsLegacyID);
1628 }
1631 }
1632
// ILP optimizations: early if-conversion (predication) before the generic
// ILP passes. Returns false (no error).
1633 bool GCNPassConfig::addILPOpts() {
1635 addPass(&EarlyIfConverterLegacyID);
1636 
1638 return false;
1639 }
1640
// SelectionDAG instruction selection, followed by SGPR-copy fixups that
// repair illegal VGPR->SGPR copies introduced during selection.
1641 bool GCNPassConfig::addInstSelector() {
1643 addPass(&SIFixSGPRCopiesLegacyID);
1645 return false;
1646 }
1647
1648bool GCNPassConfig::addIRTranslator() {
1649 addPass(new IRTranslator(getOptLevel()));
1650 return false;
1651}
1652
1653void GCNPassConfig::addPreLegalizeMachineIR() {
1654 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1655 addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1656 addPass(new Localizer());
1657}
1658
1659bool GCNPassConfig::addLegalizeMachineIR() {
1660 addPass(new Legalizer());
1661 return false;
1662}
1663
// Combines run between legalization and register-bank selection.
1664 void GCNPassConfig::addPreRegBankSelect() {
1665 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1666 addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1668 }
1669
// GlobalISel stage 3: assign register banks. Uses either the new AMDGPU
// RegBankSelect flow (behind -new-reg-bank-select) or the generic pass.
1670 bool GCNPassConfig::addRegBankSelect() {
1671 if (NewRegBankSelect) {
1674 } else {
1675 addPass(new RegBankSelect());
1676 }
1677 return false;
1678 }
1679
1680void GCNPassConfig::addPreGlobalInstructionSelect() {
1681 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1682 addPass(createAMDGPURegBankCombiner(IsOptNone));
1683}
1684
1685bool GCNPassConfig::addGlobalInstructionSelect() {
1686 addPass(new InstructionSelect(getOptLevel()));
1687 return false;
1688}
1689
// Fast (-O0) register allocation setup. NOTE(review): the insertPass calls
// this comment block refers to are elided in this rendered view.
1690 void GCNPassConfig::addFastRegAlloc() {
1691 // FIXME: We have to disable the verifier here because of PHIElimination +
1692 // TwoAddressInstructions disabling it.
1693 
1694 // This must be run immediately after phi elimination and before
1695 // TwoAddressInstructions, otherwise the processing of the tied operand of
1696 // SI_ELSE will introduce a copy of the tied operand source after the else.
1698 
1700 
1702 }
1703
// Pre-RA hook; only adds work when optimizing (pass itself elided in this
// rendered view).
1704 void GCNPassConfig::addPreRegAlloc() {
1705 if (getOptLevel() != CodeGenOptLevel::None)
1707 }
1708
// Optimized (-O1+) register allocation setup: optional DCE in RA, VGPR
// live-range optimization, and scheduling-related passes inserted before the
// allocators run. NOTE(review): several insertPass/addPass lines are elided
// in this rendered view.
1709 void GCNPassConfig::addOptimizedRegAlloc() {
1710 if (EnableDCEInRA)
1712 
1713 // FIXME: when an instruction has a Killed operand, and the instruction is
1714 // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
1715 // the register in LiveVariables, this would trigger a failure in verifier,
1716 // we should fix it and enable the verifier.
1717 if (OptVGPRLiveRange)
1719 
1720 // This must be run immediately after phi elimination and before
1721 // TwoAddressInstructions, otherwise the processing of the tied operand of
1722 // SI_ELSE will introduce a copy of the tied operand source after the else.
1724 
1727 
1728 if (isPassEnabled(EnablePreRAOptimizations))
1730 
1731 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1732 // instructions that cause scheduling barriers.
1734 
1735 if (OptExecMaskPreRA)
1737 
1738 // This is not an essential optimization and it has a noticeable impact on
1739 // compilation time, so we only enable it from O2.
1740 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1742 
1744 }
1745
// Runs just before virtual-register rewriting in the optimized VGPR
// allocation flow; reassigns NSA registers. Returns true so the caller keeps
// the VirtRegRewriter in the pipeline.
1746 bool GCNPassConfig::addPreRewrite() {
1748 addPass(&GCNNSAReassignID);
1749 
1751 return true;
1752 }
1753
1754FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1755 // Initialize the global default.
1756 llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1757 initializeDefaultSGPRRegisterAllocatorOnce);
1758
1759 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1760 if (Ctor != useDefaultRegisterAllocator)
1761 return Ctor();
1762
1763 if (Optimized)
1764 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1765
1766 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1767}
1768
1769FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1770 // Initialize the global default.
1771 llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1772 initializeDefaultVGPRRegisterAllocatorOnce);
1773
1774 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1775 if (Ctor != useDefaultRegisterAllocator)
1776 return Ctor();
1777
1778 if (Optimized)
1779 return createGreedyVGPRRegisterAllocator();
1780
1781 return createFastVGPRRegisterAllocator();
1782}
1783
1784FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
1785 // Initialize the global default.
1786 llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
1787 initializeDefaultWWMRegisterAllocatorOnce);
1788
1789 RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
1790 if (Ctor != useDefaultRegisterAllocator)
1791 return Ctor();
1792
1793 if (Optimized)
1794 return createGreedyWWMRegisterAllocator();
1795
1796 return createFastWWMRegisterAllocator();
1797}
1798
// The generic single-allocator hook is intentionally unreachable: AMDGPU
// allocates SGPRs, WWM registers and VGPRs in separate phases via the
// create{SGPR,WWM,VGPR}AllocPass helpers above.
1799 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1800 llvm_unreachable("should not be used");
1801 }
1802
1804 "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
1805 "and -vgpr-regalloc";
1806
// Fast (-O0) phased register allocation: SGPRs first, then SGPR spill
// lowering, then WWM registers, then per-thread VGPRs. Bails out (line
// elided here) when a custom -regalloc was requested, which is unsupported.
1807 bool GCNPassConfig::addRegAssignAndRewriteFast() {
1808 if (!usingDefaultRegAlloc())
1810 
1811 addPass(&GCNPreRALongBranchRegID);
1812 
1813 addPass(createSGPRAllocPass(false));
1814 
1815 // Equivalent of PEI for SGPRs.
1816 addPass(&SILowerSGPRSpillsLegacyID);
1817 
1818 // To Allocate wwm registers used in whole quad mode operations (for shaders).
1820 
1821 // For allocating other wwm register operands.
1822 addPass(createWWMRegAllocPass(false));
1823 
1824 addPass(&SILowerWWMCopiesLegacyID);
1826 
1827 // For allocating per-thread VGPRs.
1828 addPass(createVGPRAllocPass(false));
1829 
// true: we installed the register assignment passes ourselves.
1830 return true;
1831 }
1832
// Optimized phased register allocation: SGPR alloc + rewrite + stack-slot
// coloring, SGPR spill lowering, WWM alloc + rewrite, then VGPR alloc with
// the pre-rewrite hook before the final VirtRegRewriter.
1833 bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1834 if (!usingDefaultRegAlloc())
1836 
1837 addPass(&GCNPreRALongBranchRegID);
1838 
1839 addPass(createSGPRAllocPass(true));
1840 
1841 // Commit allocated register changes. This is mostly necessary because too
1842 // many things rely on the use lists of the physical registers, such as the
1843 // verifier. This is only necessary with allocators which use LiveIntervals,
1844 // since FastRegAlloc does the replacements itself.
1845 addPass(createVirtRegRewriter(false));
1846 
1847 // At this point, the sgpr-regalloc has been done and it is good to have the
1848 // stack slot coloring to try to optimize the SGPR spill stack indices before
1849 // attempting the custom SGPR spill lowering.
1850 addPass(&StackSlotColoringID);
1851 
1852 // Equivalent of PEI for SGPRs.
1853 addPass(&SILowerSGPRSpillsLegacyID);
1854 
1855 // To Allocate wwm registers used in whole quad mode operations (for shaders).
1857 
1858 // For allocating other whole wave mode registers.
1859 addPass(createWWMRegAllocPass(true));
1860 addPass(&SILowerWWMCopiesLegacyID);
1861 addPass(createVirtRegRewriter(false));
1863 
1864 // For allocating per-thread VGPRs.
1865 addPass(createVGPRAllocPass(true));
1866 
1867 addPreRewrite();
1868 addPass(&VirtRegRewriterID);
1869 
1871 
1872 return true;
1873 }
1874
// Post-RA cleanup: fix VGPR copies left by allocation; extra optimization
// passes (elided in this view) only run above -O0.
1875 void GCNPassConfig::addPostRegAlloc() {
1876 addPass(&SIFixVGPRCopiesID);
1877 if (getOptLevel() > CodeGenOptLevel::None)
1880 }
1881
// Before the second scheduling pass: optional optimization at -O1+ (pass
// elided in this view), then bundle post-RA memory operations.
1882 void GCNPassConfig::addPreSched2() {
1883 if (TM->getOptLevel() > CodeGenOptLevel::None)
1885 addPass(&SIPostRABundlerLegacyID);
1886 }
1887
// Final pre-emission pipeline: VOPD formation, memory legalization, waitcnt
// insertion, mode-register setup, hard clauses, peepholes, the standalone
// hazard recognizer, delay-ALU insertion, and branch relaxation last.
1888 void GCNPassConfig::addPreEmitPass() {
1889 if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
1890 addPass(&GCNCreateVOPDID);
1891 addPass(createSIMemoryLegalizerPass());
1892 addPass(createSIInsertWaitcntsPass());
1893 
1894 addPass(createSIModeRegisterPass());
1895 
1896 if (getOptLevel() > CodeGenOptLevel::None)
1897 addPass(&SIInsertHardClausesID);
1898 
1900 if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
1902 if (getOptLevel() > CodeGenOptLevel::None)
1903 addPass(&SIPreEmitPeepholeID);
1904 // The hazard recognizer that runs as part of the post-ra scheduler does not
1905 // guarantee to be able handle all hazards correctly. This is because if there
1906 // are multiple scheduling regions in a basic block, the regions are scheduled
1907 // bottom up, so when we begin to schedule a region we don't know what
1908 // instructions were emitted directly before it.
1909 //
1910 // Here we add a stand-alone hazard recognizer pass which can handle all
1911 // cases.
1912 addPass(&PostRAHazardRecognizerID);
1913 
1915 
1917 
1918 if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
1919 addPass(&AMDGPUInsertDelayAluID);
1920 
// Branch relaxation must run after everything that can change code size.
1921 addPass(&BranchRelaxationPassID);
1922 }
1923
// Runs after basic-block-sections assignment (the pass added here is elided
// in this rendered view).
1924 void GCNPassConfig::addPostBBSections() {
1925 // We run this later to avoid passes like livedebugvalues and BBSections
1926 // having to deal with the apparent multi-entry functions we may generate.
1928 }
1929
1931 return new GCNPassConfig(*this, PM);
1932}
1933
1939
1946
1950
1957
1960 SMDiagnostic &Error, SMRange &SourceRange) const {
1961 const yaml::SIMachineFunctionInfo &YamlMFI =
1962 static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1963 MachineFunction &MF = PFS.MF;
1965 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1966
1967 if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1968 return true;
1969
1970 if (MFI->Occupancy == 0) {
1971 // Fixup the subtarget dependent default value.
1972 MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
1973 }
1974
1975 auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1976 Register TempReg;
1977 if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1978 SourceRange = RegName.SourceRange;
1979 return true;
1980 }
1981 RegVal = TempReg;
1982
1983 return false;
1984 };
1985
1986 auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
1987 Register &RegVal) {
1988 return !RegName.Value.empty() && parseRegister(RegName, RegVal);
1989 };
1990
1991 if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
1992 return true;
1993
1994 if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
1995 return true;
1996
1997 if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
1998 MFI->LongBranchReservedReg))
1999 return true;
2000
2001 auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
2002 // Create a diagnostic for a the register string literal.
2003 const MemoryBuffer &Buffer =
2004 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
2005 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
2006 RegName.Value.size(), SourceMgr::DK_Error,
2007 "incorrect register class for field", RegName.Value,
2008 {}, {});
2009 SourceRange = RegName.SourceRange;
2010 return true;
2011 };
2012
2013 if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
2014 parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
2015 parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
2016 return true;
2017
2018 if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
2019 !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
2020 return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
2021 }
2022
2023 if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
2024 !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
2025 return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
2026 }
2027
2028 if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
2029 !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
2030 return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
2031 }
2032
2033 for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
2034 Register ParsedReg;
2035 if (parseRegister(YamlReg, ParsedReg))
2036 return true;
2037
2038 MFI->reserveWWMRegister(ParsedReg);
2039 }
2040
2041 for (const auto &[_, Info] : PFS.VRegInfosNamed) {
2042 MFI->setFlag(Info->VReg, Info->Flags);
2043 }
2044 for (const auto &[_, Info] : PFS.VRegInfos) {
2045 MFI->setFlag(Info->VReg, Info->Flags);
2046 }
2047
2048 for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
2049 Register ParsedReg;
2050 if (parseRegister(YamlRegStr, ParsedReg))
2051 return true;
2052 MFI->SpillPhysVGPRs.push_back(ParsedReg);
2053 }
2054
2055 auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
2056 const TargetRegisterClass &RC,
2057 ArgDescriptor &Arg, unsigned UserSGPRs,
2058 unsigned SystemSGPRs) {
2059 // Skip parsing if it's not present.
2060 if (!A)
2061 return false;
2062
2063 if (A->IsRegister) {
2064 Register Reg;
2065 if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
2066 SourceRange = A->RegisterName.SourceRange;
2067 return true;
2068 }
2069 if (!RC.contains(Reg))
2070 return diagnoseRegisterClass(A->RegisterName);
2072 } else
2073 Arg = ArgDescriptor::createStack(A->StackOffset);
2074 // Check and apply the optional mask.
2075 if (A->Mask)
2076 Arg = ArgDescriptor::createArg(Arg, *A->Mask);
2077
2078 MFI->NumUserSGPRs += UserSGPRs;
2079 MFI->NumSystemSGPRs += SystemSGPRs;
2080 return false;
2081 };
2082
2083 if (YamlMFI.ArgInfo &&
2084 (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
2085 AMDGPU::SGPR_128RegClass,
2086 MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
2087 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
2088 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
2089 2, 0) ||
2090 parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
2091 MFI->ArgInfo.QueuePtr, 2, 0) ||
2092 parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
2093 AMDGPU::SReg_64RegClass,
2094 MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
2095 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
2096 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
2097 2, 0) ||
2098 parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
2099 AMDGPU::SReg_64RegClass,
2100 MFI->ArgInfo.FlatScratchInit, 2, 0) ||
2101 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
2102 AMDGPU::SGPR_32RegClass,
2103 MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
2104 parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
2105 AMDGPU::SGPR_32RegClass,
2106 MFI->ArgInfo.LDSKernelId, 0, 1) ||
2107 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
2108 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
2109 0, 1) ||
2110 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
2111 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
2112 0, 1) ||
2113 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
2114 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
2115 0, 1) ||
2116 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
2117 AMDGPU::SGPR_32RegClass,
2118 MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
2119 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
2120 AMDGPU::SGPR_32RegClass,
2121 MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
2122 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
2123 AMDGPU::SReg_64RegClass,
2124 MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
2125 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
2126 AMDGPU::SReg_64RegClass,
2127 MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
2128 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
2129 AMDGPU::VGPR_32RegClass,
2130 MFI->ArgInfo.WorkItemIDX, 0, 0) ||
2131 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
2132 AMDGPU::VGPR_32RegClass,
2133 MFI->ArgInfo.WorkItemIDY, 0, 0) ||
2134 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
2135 AMDGPU::VGPR_32RegClass,
2136 MFI->ArgInfo.WorkItemIDZ, 0, 0)))
2137 return true;
2138
2139 // Parse FirstKernArgPreloadReg separately, since it's a Register,
2140 // not ArgDescriptor.
2141 if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) {
2142 const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg;
2143
2144 if (!A.IsRegister) {
2145 // For stack arguments, we don't have RegisterName.SourceRange,
2146 // but we should have some location info from the YAML parser
2147 const MemoryBuffer &Buffer =
2148 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
2149 // Create a minimal valid source range
2151 SMRange Range(Loc, Loc);
2152
2154 *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error,
2155 "firstKernArgPreloadReg must be a register, not a stack location", "",
2156 {}, {});
2157
2158 SourceRange = Range;
2159 return true;
2160 }
2161
2162 Register Reg;
2163 if (parseNamedRegisterReference(PFS, Reg, A.RegisterName.Value, Error)) {
2164 SourceRange = A.RegisterName.SourceRange;
2165 return true;
2166 }
2167
2168 if (!AMDGPU::SGPR_32RegClass.contains(Reg))
2169 return diagnoseRegisterClass(A.RegisterName);
2170
2171 MFI->ArgInfo.FirstKernArgPreloadReg = Reg;
2172 MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs;
2173 }
2174
2175 if (ST.hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode)) {
2176 MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
2177 MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
2178 }
2179
2180 // FIXME: Move proper support for denormal-fp-math into base MachineFunction
2181 MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
2184 MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
2187
2194
2195 if (YamlMFI.HasInitWholeWave)
2196 MFI->setInitWholeWave();
2197
2198 return false;
2199}
2200
2201//===----------------------------------------------------------------------===//
2202// AMDGPU CodeGen Pass Builder interface.
2203//===----------------------------------------------------------------------===//
2204
// New-pass-manager CodeGen pipeline builder for GCN. Mirrors the legacy
// GCNPassConfig configuration: post-RA machine scheduling enabled, CGSCC
// order required, and unsupported passes disabled.
2205 AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
2206 GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
2208 : CodeGenPassBuilder(TM, Opts, PIC) {
2209 Opt.MISchedPostRA = true;
2210 Opt.RequiresCodeGenSCCOrder = true;
2211 // Exceptions and StackMaps are not supported, so these passes will never do
2212 // anything.
2213 // Garbage collection is not supported.
2214 disablePass<StackMapLivenessPass, FuncletLayoutPass, PatchableFunctionPass,
2216 }
2217
// New-pass-manager counterpart of the legacy AMDGPUPassConfig::addIRPasses:
// module-level AMDGPU lowerings (printf binding, ctor/dtor, LDS, inlining),
// then function-level preparation (atomic optimizer/expand, promote-alloca,
// scalar optimizations). Pass order matches the legacy pipeline.
2218 void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
2219 if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) {
2220 flushFPMsToMPM(PMW);
2221 addModulePass(AMDGPURemoveIncompatibleFunctionsPass(TM), PMW);
2222 }
2223 
2224 flushFPMsToMPM(PMW);
2225 
2226 if (TM.getTargetTriple().isAMDGCN())
2227 addModulePass(AMDGPUPrintfRuntimeBindingPass(), PMW);
2228 
2229 if (LowerCtorDtor)
2230 addModulePass(AMDGPUCtorDtorLoweringPass(), PMW);
2231 
2232 if (isPassEnabled(EnableImageIntrinsicOptimizer))
2233 addFunctionPass(AMDGPUImageIntrinsicOptimizerPass(TM), PMW);
2234 
2236 addFunctionPass(AMDGPUUniformIntrinsicCombinePass(), PMW);
2237 // This can be disabled by passing ::Disable here or on the command line
2238 // with --expand-variadics-override=disable.
2239 flushFPMsToMPM(PMW);
2241 
// Function calls are not supported, so inline everything.
2242 addModulePass(AMDGPUAlwaysInlinePass(), PMW);
2243 addModulePass(AlwaysInlinerPass(), PMW);
2244 
2245 addModulePass(AMDGPUExportKernelRuntimeHandlesPass(), PMW);
2246 
2248 addModulePass(AMDGPULowerExecSyncPass(), PMW);
2249 
2250 if (EnableSwLowerLDS)
2251 addModulePass(AMDGPUSwLowerLDSPass(TM), PMW);
2252 
2253 // Runs before PromoteAlloca so the latter can account for function uses
2255 addModulePass(AMDGPULowerModuleLDSPass(TM), PMW);
2256 
2257 // Run atomic optimizer before Atomic Expand
2258 if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
2260 addFunctionPass(
2262 
2263 addFunctionPass(AtomicExpandPass(TM), PMW);
2264 
2265 if (TM.getOptLevel() > CodeGenOptLevel::None) {
2266 addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
2267 if (isPassEnabled(EnableScalarIRPasses))
2268 addStraightLineScalarOptimizationPasses(PMW);
2269 
2270 // TODO: Handle EnableAMDGPUAliasAnalysis
2271 
2272 // TODO: May want to move later or split into an early and late one.
2273 addFunctionPass(AMDGPUCodeGenPreparePass(TM), PMW);
2274 
2275 // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
2276 // have expanded.
2277 if (TM.getOptLevel() > CodeGenOptLevel::Less) {
2279 /*UseMemorySSA=*/true),
2280 PMW);
2281 }
2282 }
2283 
2284 Base::addIRPasses(PMW);
2285 
2286 // EarlyCSE is not always strong enough to clean up what LSR produces. For
2287 // example, GVN can combine
2288 //
2289 // %0 = add %a, %b
2290 // %1 = add %b, %a
2291 //
2292 // and
2293 //
2294 // %0 = shl nsw %a, 2
2295 // %1 = shl %a, 2
2296 //
2297 // but EarlyCSE can do neither of them.
2298 if (isPassEnabled(EnableScalarIRPasses))
2299 addEarlyCSEOrGVNPass(PMW);
2300 }
2301
// New-pass-manager codegen-prepare stage: kernel-argument preloading and
// lowering, the generic CodeGenPrepare, load/store vectorization, buffer
// fat-pointer lowering, and switch lowering last.
2302 void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(
2303 PassManagerWrapper &PMW) const {
2304 if (TM.getOptLevel() > CodeGenOptLevel::None) {
2305 flushFPMsToMPM(PMW);
2306 addModulePass(AMDGPUPreloadKernelArgumentsPass(TM), PMW);
2307 }
2308 
2310 addFunctionPass(AMDGPULowerKernelArgumentsPass(TM), PMW);
2311 
2312 Base::addCodeGenPrepare(PMW);
2313 
2314 if (isPassEnabled(EnableLoadStoreVectorizer))
2315 addFunctionPass(LoadStoreVectorizerPass(), PMW);
2316 
2317 // This lowering has been placed after codegenprepare to take advantage of
2318 // address mode matching (which is why it isn't put with the LDS lowerings).
2319 // It could be placed anywhere before uniformity annotations (an analysis
2320 // that it changes by splitting up fat pointers into their components)
2321 // but has been put before switch lowering and CFG flattening so that those
2322 // passes can run on the more optimized control flow this pass creates in
2323 // many cases.
2324 flushFPMsToMPM(PMW);
2325 addModulePass(AMDGPULowerBufferFatPointersPass(TM), PMW);
2326 flushFPMsToMPM(PMW);
2327 requireCGSCCOrder(PMW);
2328 
2329 addModulePass(AMDGPULowerIntrinsicsPass(TM), PMW);
2330 
2331 // LowerSwitch pass may introduce unreachable blocks that can cause unexpected
2332 // behavior for subsequent passes. Placing it here seems better that these
2333 // blocks would get cleaned up by UnreachableBlockElim inserted next in the
2334 // pass flow.
2335 addFunctionPass(LowerSwitchPass(), PMW);
2336 }
2337
// New-pass-manager pre-ISel stage: CFG flattening/sinking at -O1+, then CFG
// structurization and control-flow annotation so divergent control flow can
// be selected; LCSSA for the SDAG path, perf-hint analysis at -O2+.
2338 void AMDGPUCodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const {
2339 
2340 if (TM.getOptLevel() > CodeGenOptLevel::None) {
2341 addFunctionPass(FlattenCFGPass(), PMW);
2342 addFunctionPass(SinkingPass(), PMW);
2343 addFunctionPass(AMDGPULateCodeGenPreparePass(TM), PMW);
2344 }
2345 
2346 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
2347 // regions formed by them.
2348 
2349 addFunctionPass(AMDGPUUnifyDivergentExitNodesPass(), PMW);
2350 addFunctionPass(FixIrreduciblePass(), PMW);
2351 addFunctionPass(UnifyLoopExitsPass(), PMW);
2352 addFunctionPass(StructurizeCFGPass(/*SkipUniformRegions=*/false), PMW);
2353 
2354 addFunctionPass(AMDGPUAnnotateUniformValuesPass(), PMW);
2355 
2356 addFunctionPass(SIAnnotateControlFlowPass(TM), PMW);
2357 
2358 // TODO: Move this right after structurizeCFG to avoid extra divergence
2359 // analysis. This depends on stopping SIAnnotateControlFlow from making
2360 // control flow modifications.
2361 addFunctionPass(AMDGPURewriteUndefForPHIPass(), PMW);
2362 
2364 !isGlobalISelAbortEnabled() || !NewRegBankSelect)
2365 addFunctionPass(LCSSAPass(), PMW);
2366 
2367 if (TM.getOptLevel() > CodeGenOptLevel::Less) {
2368 flushFPMsToMPM(PMW);
2369 addModulePass(AMDGPUPerfHintAnalysisPass(TM), PMW);
2370 }
2371 
2372 // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
2373 // isn't this in addInstSelector?
2375 /*Force=*/true);
2376 }
2377
// NPM ILP optimizations: early if-conversion (condition line elided in this
// rendered view), then the generic ILP passes.
2378 void AMDGPUCodeGenPassBuilder::addILPOpts(PassManagerWrapper &PMW) const {
2380 addMachineFunctionPass(EarlyIfConverterPass(), PMW);
2381 
2382 Base::addILPOpts(PMW);
2383 }
2384
2385void AMDGPUCodeGenPassBuilder::addAsmPrinterBegin(
2386 PassManagerWrapper &PMW) const {
2387 // TODO: Add AsmPrinterBegin
2388}
2389
2390void AMDGPUCodeGenPassBuilder::addAsmPrinter(PassManagerWrapper &PMW) const {
2391 // TODO: Add AsmPrinter.
2392}
2393
2394void AMDGPUCodeGenPassBuilder::addAsmPrinterEnd(PassManagerWrapper &PMW) const {
2395 // TODO: Add AsmPrinterEnd
2396}
2397
// Install SelectionDAG instruction selection, followed by the SI copy
// fixups that run immediately after selection.
Error AMDGPUCodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const {
  addMachineFunctionPass(AMDGPUISelDAGToDAGPass(TM), PMW);
  addMachineFunctionPass(SIFixSGPRCopiesPass(), PMW);
  addMachineFunctionPass(SILowerI1CopiesPass(), PMW);
  return Error::success();
}
2404
// Passes inserted between register allocation and the final virtual-register
// rewrite (called from addRegAssignmentOptimized).
void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const {
  // NSA reassignment is part of the register-reassign optimizations
  // (cl::opt amdgpu-reassign-regs, "on gfx10+", default on).
  if (EnableRegReassign) {
    addMachineFunctionPass(GCNNSAReassignPass(), PMW);
  }

  addMachineFunctionPass(AMDGPURewriteAGPRCopyMFMAPass(), PMW);
}
2412
// AMDGPU-specific machine-SSA optimizations, layered after the
// target-independent ones.
void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
    PassManagerWrapper &PMW) const {
  Base::addMachineSSAOptimization(PMW);

  addMachineFunctionPass(SIFoldOperandsPass(), PMW);
  // DPP combining (cl::opt amdgpu-dpp-combine, default on).
  if (EnableDPPCombine) {
    addMachineFunctionPass(GCNDPPCombinePass(), PMW);
  }
  addMachineFunctionPass(SILoadStoreOptimizerPass(), PMW);
  // SDWA peephole (cl::opt amdgpu-sdwa-peephole, default on), followed by
  // cleanup passes and a second operand-folding run over the rewritten code.
  if (isPassEnabled(EnableSDWAPeephole)) {
    addMachineFunctionPass(SIPeepholeSDWAPass(), PMW);
    addMachineFunctionPass(EarlyMachineLICMPass(), PMW);
    addMachineFunctionPass(MachineCSEPass(), PMW);
    addMachineFunctionPass(SIFoldOperandsPass(), PMW);
  }
  addMachineFunctionPass(DeadMachineInstructionElimPass(), PMW);
  addMachineFunctionPass(SIShrinkInstructionsPass(), PMW);
}
2431
Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const {
  // SILowerControlFlow must run right after PHI elimination and before
  // TwoAddressInstructions; otherwise processing of the tied operand of
  // SI_ELSE introduces a copy of the tied operand source after the else
  // (see the matching comment in addOptimizedRegAlloc).
  insertPass<PHIEliminationPass>(SILowerControlFlowPass());

  // Insert whole-quad-mode exec handling after two-address conversion,
  // before register allocation.
  insertPass<TwoAddressInstructionPass>(SIWholeQuadModePass());

  return Base::addFastRegAlloc(PMW);
}
2439
// Build the fast (-O0) register-assignment pipeline: SGPRs, then WWM
// registers, then VGPRs, each allocated by a separate, filtered run of the
// register allocator.
Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast(
    PassManagerWrapper &PMW) const {
  if (auto Err = validateRegAllocOptions())
    return Err;

  addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW);

  // SGPR allocation - default to fast at -O0 (a greedy override is honored).
  if (SGPRRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);
  else
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
                           PMW);

  // Equivalent of PEI for SGPRs.
  addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW);

  // Allocate WWM registers used in whole quad mode operations (for shaders).
  addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW);

  // WWM allocation - default to fast at -O0.
  if (WWMRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
  else
    addMachineFunctionPass(
        RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);

  addMachineFunctionPass(SILowerWWMCopiesPass(), PMW);
  addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW);

  // VGPR allocation - default to fast at -O0.
  if (VGPRRegAllocNPM == RegAllocType::Greedy)
    addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);
  else
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);

  return Error::success();
}
2478
// Insert AMDGPU passes into the optimized register-allocation pipeline.
Error AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
    PassManagerWrapper &PMW) const {
  // Machine DCE inside regalloc (cl::opt amdgpu-dce-in-ra, default on).
  if (EnableDCEInRA)
    insertPass<DetectDeadLanesPass>(DeadMachineInstructionElimPass());

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
  // the register in LiveVariables, this would trigger a failure in verifier,
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass<RequireAnalysisPass<LiveVariablesAnalysis, MachineFunction>>(
        // NOTE(review): the pass argument (original line 2490) is missing
        // from this extracted view -- verify against the full source.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass<PHIEliminationPass>(SILowerControlFlowPass());

  // NOTE(review): the guard preceding this line (original line 2497,
  // presumably gated on the rewrite-partial-reg-uses option) is missing
  // from this extracted view -- verify against the full source.
  insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass());

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass<MachineSchedulerPass>(GCNPreRAOptimizationsPass());

  // Allow the scheduler to run before SIWholeQuadMode inserts exec
  // manipulation instructions that cause scheduling barriers.
  insertPass<MachineSchedulerPass>(SIWholeQuadModePass());

  if (OptExecMaskPreRA)
    insertPass<MachineSchedulerPass>(SIOptimizeExecMaskingPreRAPass());

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM.getOptLevel() > CodeGenOptLevel::Less)
    insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass());

  return Base::addOptimizedRegAlloc(PMW);
}
2517
2518void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const {
2519 if (getOptLevel() != CodeGenOptLevel::None)
2520 addMachineFunctionPass(AMDGPUPrepareAGPRAllocPass(), PMW);
2521}
2522
// Build the optimized (-O1 and above) register-assignment pipeline.
// Allocation is split into three phases -- SGPRs, WWM registers, then
// VGPRs -- with spill lowering and rewriting between phases.
Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
    PassManagerWrapper &PMW) const {
  if (auto Err = validateRegAllocOptions())
    return Err;

  addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW);

  // SGPR allocation - default to greedy at -O1 and above.
  if (SGPRRegAllocNPM == RegAllocType::Fast)
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
                           PMW);
  else
    addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW);

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addMachineFunctionPass(VirtRegRewriterPass(false), PMW);

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addMachineFunctionPass(StackSlotColoringPass(), PMW);

  // Equivalent of PEI for SGPRs.
  addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW);

  // Allocate WWM registers used in whole quad mode operations (for shaders).
  addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW);

  // WWM allocation - default to greedy at -O1 and above.
  if (WWMRegAllocNPM == RegAllocType::Fast)
    addMachineFunctionPass(
        RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW);
  else
    addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW);
  addMachineFunctionPass(SILowerWWMCopiesPass(), PMW);
  addMachineFunctionPass(VirtRegRewriterPass(false), PMW);
  addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW);

  // VGPR allocation - default to greedy at -O1 and above.
  if (VGPRRegAllocNPM == RegAllocType::Fast)
    addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
  else
    addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW);

  // Run addPreRewrite passes before the final rewrite of virtual registers.
  addPreRewrite(PMW);
  addMachineFunctionPass(VirtRegRewriterPass(true), PMW);

  addMachineFunctionPass(AMDGPUMarkLastScratchLoadPass(), PMW);
  return Error::success();
}
2576
// Passes run immediately after register allocation.
void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const {
  addMachineFunctionPass(SIFixVGPRCopiesPass(), PMW);
  // Exec-mask optimization is skipped at -O0.
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIOptimizeExecMaskingPass(), PMW);
  Base::addPostRegAlloc(PMW);
}
2583
// Passes run before the second (post-RA) scheduler.
void AMDGPUCodeGenPassBuilder::addPreSched2(PassManagerWrapper &PMW) const {
  // Instruction shrinking is an optimization; the post-RA bundler always runs.
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIShrinkInstructionsPass(), PMW);
  addMachineFunctionPass(SIPostRABundlerPass(), PMW);
}
2589
// Passes run after basic-block sections have been assigned.
void AMDGPUCodeGenPassBuilder::addPostBBSections(
    PassManagerWrapper &PMW) const {
  // We run this later to avoid passes like livedebugvalues and BBSections
  // having to deal with the apparent multi-entry functions we may generate
  // (the preload prolog looks like a second entry point to those passes).
  addMachineFunctionPass(AMDGPUPreloadKernArgPrologPass(), PMW);
}
2596
// Final machine passes before code emission: memory legalization, waitcnt
// insertion, hazard handling, and branch relaxation.
void AMDGPUCodeGenPassBuilder::addPreEmitPass(PassManagerWrapper &PMW) const {
  // VOPD formation -- dual issue of VALU in wave32 (cl::opt
  // amdgpu-enable-vopd, default on at -O1 and above).
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
    addMachineFunctionPass(GCNCreateVOPDPass(), PMW);
  }

  addMachineFunctionPass(SIMemoryLegalizerPass(), PMW);
  addMachineFunctionPass(SIInsertWaitcntsPass(), PMW);

  addMachineFunctionPass(SIModeRegisterPass(), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIInsertHardClausesPass(), PMW);

  addMachineFunctionPass(SILateBranchLoweringPass(), PMW);

  // Wave-priority adjustment (cl::opt amdgpu-set-wave-priority, default off).
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addMachineFunctionPass(AMDGPUSetWavePriorityPass(), PMW);

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addMachineFunctionPass(SIPreEmitPeepholePass(), PMW);

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addMachineFunctionPass(PostRAHazardRecognizerPass(), PMW);
  addMachineFunctionPass(AMDGPUWaitSGPRHazardsPass(), PMW);
  addMachineFunctionPass(AMDGPULowerVGPREncodingPass(), PMW);

  // s_delay_alu insertion (cl::opt amdgpu-enable-delay-alu, default on at
  // -O1 and above).
  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
    addMachineFunctionPass(AMDGPUInsertDelayAluPass(), PMW);
  }

  addMachineFunctionPass(BranchRelaxationPass(), PMW);
}
2636
2637bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
2638 CodeGenOptLevel Level) const {
2639 if (Opt.getNumOccurrences())
2640 return Opt;
2641 if (TM.getOptLevel() < Level)
2642 return false;
2643 return Opt;
2644}
2645
2646void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(
2647 PassManagerWrapper &PMW) const {
2648 if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
2649 addFunctionPass(GVNPass(), PMW);
2650 else
2651 addFunctionPass(EarlyCSEPass(), PMW);
2652}
2653
// Scalar IR optimizations over straight-line code: GEP reassociation, SLSR,
// and CSE/GVN cleanup between them.
void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
    PassManagerWrapper &PMW) const {
  // NOTE(review): the guard preceding this line (original line 2656,
  // presumably gated on the loop-prefetch option) is missing from this
  // extracted view -- verify against the full source.
  addFunctionPass(LoopDataPrefetchPass(), PMW);

  addFunctionPass(SeparateConstOffsetFromGEPPass(), PMW);

  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addFunctionPass(StraightLineStrengthReducePass(), PMW);

  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN
  // or EarlyCSE can reuse.
  addEarlyCSEOrGVNPass(PMW);

  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addFunctionPass(NaryReassociatePass(), PMW);

  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addFunctionPass(EarlyCSEPass(), PMW);
}
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(true))
static std::unique_ptr< TargetLoweringObjectFile > createTLOF(const Triple &TT)
This is the AMDGPU address space based alias analysis pass.
Coexecution-focused scheduling strategy for AMDGPU.
Defines an instruction selector for the AMDGPU target.
Analyzes if a function is potentially memory bound and if a kernel may benefit from limiting numb...
Analyzes how many registers and other resources are used by functions.
static cl::opt< bool > EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, cl::desc("Enable machine DCE inside regalloc"))
static cl::opt< bool, true > EnableLowerModuleLDS("amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), cl::Hidden)
static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry("gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause", createGCNMaxMemoryClauseMachineScheduler)
static Reloc::Model getEffectiveRelocModel()
static cl::opt< bool > EnableUniformIntrinsicCombine("amdgpu-enable-uniform-intrinsic-combine", cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"), cl::init(true), cl::Hidden)
static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", createSIMachineScheduler)
static ScheduleDAGInstrs * createIterativeILPMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EarlyInlineAll("amdgpu-early-inline-all", cl::desc("Inline all functions early"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", cl::desc("Enable lowering of lds to global memory pass " "and asan instrument resulting IR."), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableLowerKernelArguments("amdgpu-ir-lower-kernel-arguments", cl::desc("Lower kernel argument loads in IR pass"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createGCNMaxILPMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableSDWAPeephole("amdgpu-sdwa-peephole", cl::desc("Enable SDWA peepholer"), cl::init(true))
static MachineSchedRegistry GCNMinRegSchedRegistry("gcn-iterative-minreg", "Run GCN iterative scheduler for minimal register usage (experimental)", createMinRegScheduler)
static void diagnoseUnsupportedCoExecSchedulerSelection(const Function &F, const GCNSubtarget &ST)
static cl::opt< bool > EnableImageIntrinsicOptimizer("amdgpu-enable-image-intrinsic-optimizer", cl::desc("Enable image intrinsic optimizer pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > HasClosedWorldAssumption("amdgpu-link-time-closed-world", cl::desc("Whether has closed-world assumption at link time"), cl::init(false), cl::Hidden)
static bool useNoopPostScheduler(const Function &F)
static ScheduleDAGInstrs * createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableSIModeRegisterPass("amdgpu-mode-register", cl::desc("Enable mode register pass"), cl::init(true), cl::Hidden)
static cl::opt< std::string > AMDGPUSchedStrategy("amdgpu-sched-strategy", cl::desc("Select custom AMDGPU scheduling strategy."), cl::Hidden, cl::init(""))
static cl::opt< bool > EnableDPPCombine("amdgpu-dpp-combine", cl::desc("Enable DPP combiner"), cl::init(true))
static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry("gcn-iterative-max-occupancy-experimental", "Run GCN scheduler to maximize occupancy (experimental)", createIterativeGCNMaxOccupancyMachineScheduler)
static cl::opt< bool > EnableSetWavePriority("amdgpu-set-wave-priority", cl::desc("Adjust wave priority"), cl::init(false), cl::Hidden)
static cl::opt< bool > LowerCtorDtor("amdgpu-lower-global-ctor-dtor", cl::desc("Lower GPU ctor / dtors to globals on the device."), cl::init(true), cl::Hidden)
static cl::opt< bool > OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, cl::desc("Run pre-RA exec mask optimizations"), cl::init(true))
static cl::opt< bool > EnablePromoteKernelArguments("amdgpu-enable-promote-kernel-arguments", cl::desc("Enable promotion of flat kernel pointer arguments to global"), cl::Hidden, cl::init(true))
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget()
static cl::opt< bool > EnableRewritePartialRegUses("amdgpu-enable-rewrite-partial-reg-uses", cl::desc("Enable rewrite partial reg uses pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableLibCallSimplify("amdgpu-simplify-libcall", cl::desc("Enable amdgpu library simplifications"), cl::init(true), cl::Hidden)
static MachineSchedRegistry GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp", createGCNMaxILPMachineScheduler)
static cl::opt< bool > InternalizeSymbols("amdgpu-internalize-symbols", cl::desc("Enable elimination of non-kernel functions and unused globals"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableAMDGPUAttributor("amdgpu-attributor-enable", cl::desc("Enable AMDGPUAttributorPass"), cl::init(true), cl::Hidden)
static LLVM_READNONE StringRef getGPUOrDefault(const Triple &TT, StringRef GPU)
Expected< AMDGPUAttributorOptions > parseAMDGPUAttributorPassOptions(StringRef Params)
static cl::opt< bool > EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true))
static Expected< ScanOptions > parseAMDGPUAtomicOptimizerStrategy(StringRef Params)
static ScheduleDAGInstrs * createMinRegScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableHipStdPar("amdgpu-enable-hipstdpar", cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableInsertDelayAlu("amdgpu-enable-delay-alu", cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer", cl::desc("Enable load store vectorizer"), cl::init(true), cl::Hidden)
static bool mustPreserveGV(const GlobalValue &GV)
Predicate for Internalize pass.
static cl::opt< bool > EnableLoopPrefetch("amdgpu-loop-prefetch", cl::desc("Enable loop data prefetch on AMDGPU"), cl::Hidden, cl::init(false))
static cl::opt< bool > NewRegBankSelect("new-reg-bank-select", cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of " "regbankselect"), cl::init(false), cl::Hidden)
static cl::opt< bool > RemoveIncompatibleFunctions("amdgpu-enable-remove-incompatible-functions", cl::Hidden, cl::desc("Enable removal of functions when they" "use features not supported by the target GPU"), cl::init(true))
static cl::opt< bool > EnableScalarIRPasses("amdgpu-scalar-ir-passes", cl::desc("Enable scalar IR passes"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableRegReassign("amdgpu-reassign-regs", cl::desc("Enable register reassign optimizations on gfx10+"), cl::init(true), cl::Hidden)
static cl::opt< bool > OptVGPRLiveRange("amdgpu-opt-vgpr-liverange", cl::desc("Enable VGPR liverange optimizations for if-else structure"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createSIMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations", cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), cl::Hidden)
static cl::opt< ScanOptions > AMDGPUAtomicOptimizerStrategy("amdgpu-atomic-optimizer-strategy", cl::desc("Select DPP or Iterative strategy for scan"), cl::init(ScanOptions::Iterative), cl::values(clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"), clEnumValN(ScanOptions::Iterative, "Iterative", "Use Iterative approach for scan"), clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")))
static cl::opt< bool > EnableVOPD("amdgpu-enable-vopd", cl::desc("Enable VOPD, dual issue of VALU in wave32"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableLowerExecSync("amdgpu-enable-lower-exec-sync", cl::desc("Enable lowering of execution synchronization."), cl::init(true), cl::Hidden)
static MachineSchedRegistry GCNILPSchedRegistry("gcn-iterative-ilp", "Run GCN iterative scheduler for ILP scheduling (experimental)", createIterativeILPMachineScheduler)
static cl::opt< bool > ScalarizeGlobal("amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), cl::init(true), cl::Hidden)
static const char RegAllocOptNotSupportedMessage[]
static MachineSchedRegistry GCNMaxOccupancySchedRegistry("gcn-max-occupancy", "Run GCN scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler)
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file declares the AMDGPU-specific subclass of TargetLoweringObjectFile.
This file a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
Provides passes to inlining "always_inline" functions.
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This header provides classes for managing passes over SCCs of the call graph.
Provides analysis for continuously CSEing during GISel passes.
Interfaces for producing common pass manager configurations.
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_ABI
Definition Compiler.h:213
#define LLVM_EXTERNAL_VISIBILITY
Definition Compiler.h:132
DXIL Legalizer
This file provides the interface for a simple, fast CSE pass.
This file defines the class GCNIterativeScheduler, which uses an iterative approach to find a best sc...
This file provides the interface for LLVM's Global Value Numbering pass which eliminates fully redund...
#define _
AcceleratorCodeSelection - Identify all functions reachable from a kernel, removing those that are un...
This file declares the IRTranslator pass.
This header defines various interfaces for pass management in LLVM.
#define RegName(no)
This file provides the interface for LLVM's Loop Data Prefetching Pass.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
CGSCCAnalysisManager CGAM
LoopAnalysisManager LAM
FunctionAnalysisManager FAM
ModuleAnalysisManager MAM
PassInstrumentationCallbacks PIC
PassBuilder PB(Machine, PassOpts->PTO, std::nullopt, &PIC)
static bool isLTOPreLink(ThinOrFullLTOPhase Phase)
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file describes the interface of the MachineFunctionPass responsible for assigning the generic vi...
const GCNTargetMachine & getTM(const GCNSubtarget *STI)
SI Machine Scheduler interface.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static FunctionPass * useDefaultRegisterAllocator()
-regalloc=... command line option.
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
LLVM IR instance of the generic uniformity analysis.
static std::unique_ptr< TargetLoweringObjectFile > createTLOF()
A manager for alias analyses.
void registerFunctionAnalysis()
Register a specific AA result.
void addAAResult(AAResultT &AAResult)
Register a specific AA result.
Legacy wrapper pass to provide the AMDGPUAAResult object.
Analysis pass providing a never-invalidated alias analysis result.
Lower llvm.global_ctors and llvm.global_dtors to special kernels.
AMDGPUTargetMachine & getAMDGPUTargetMachine() const
std::unique_ptr< CSEConfigBase > getCSEConfig() const override
Returns the CSEConfig object to use for the current optimization level.
bool isPassEnabled(const cl::opt< bool > &Opt, CodeGenOptLevel Level=CodeGenOptLevel::Default) const
Check if a pass is enabled given Opt option.
bool addPreISel() override
Methods with trivial inline returns are convenient points in the common codegen pass pipeline where t...
bool addInstSelector() override
addInstSelector - This method should install an instruction selector pass, which converts from LLVM c...
bool addGCPasses() override
addGCPasses - Add late codegen passes that analyze code for garbage collection.
AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
void addIRPasses() override
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
void addCodeGenPrepare() override
Add pass to prepare the LLVM IR for code generation.
Splits the module M into N linkable partitions.
std::unique_ptr< TargetLoweringObjectFile > TLOF
unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override
getAddressSpaceForPseudoSourceKind - Given the kind of memory (e.g.
const TargetSubtargetInfo * getSubtargetImpl() const
void registerDefaultAliasAnalyses(AAManager &) override
Allow the target to register alias analyses with the AAManager for use with the new pass manager.
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const override
If the specified predicate checks whether a generic pointer falls within a specified address space,...
StringRef getFeatureString(const Function &F) const
ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override
Create an instance of ScheduleDAGInstrs to be run within the standard MachineScheduler pass for this ...
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM, CodeGenOptLevel OL)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
void registerPassBuilderCallbacks(PassBuilder &PB) override
Allow the target to modify the pass pipeline.
StringRef getGPUName(const Function &F) const
unsigned getAssumedAddrSpace(const Value *V) const override
If the specified generic pointer could be assumed as a pointer to a specific address space,...
bool splitModule(Module &M, unsigned NumParts, function_ref< void(std::unique_ptr< Module > MPart)> ModuleCallback) override
Entry point for module splitting.
Inlines functions marked as "always_inline".
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
This class provides access to building LLVM's passes.
CodeGenTargetMachineImpl(const Target &T, StringRef DataLayoutString, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOptLevel OL)
LLVM_ABI void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Diagnostic information for unsupported feature in backend.
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
LowerIntrinsics - This pass rewrites calls to the llvm.gcread or llvm.gcwrite intrinsics,...
Definition GCMetadata.h:229
const SIRegisterInfo * getRegisterInfo() const override
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override
Similar to createMachineScheduler but used when postRA machine scheduling is enabled.
ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override
Create an instance of ScheduleDAGInstrs to be run within the standard MachineScheduler pass for this ...
void registerMachineRegisterInfoCallback(MachineFunction &MF) const override
bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const override
Parse out the target's MachineFunctionInfo from the YAML representation.
yaml::MachineFunctionInfo * convertFuncInfoToYAML(const MachineFunction &MF) const override
Allocate and initialize an instance of the YAML representation of the MachineFunctionInfo.
yaml::MachineFunctionInfo * createDefaultFuncInfoYAML() const override
Allocate and return a default initialized instance of the YAML representation for the MachineFunction...
Error buildCodeGenPipeline(ModulePassManager &MPM, ModuleAnalysisManager &MAM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, CodeGenFileType FileType, const CGPassBuilderOption &Opts, MCContext &Ctx, PassInstrumentationCallbacks *PIC) override
TargetPassConfig * createPassConfig(PassManagerBase &PM) override
Create a pass configuration object to be used by addPassToEmitX methods for generating a pipeline of ...
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM, CodeGenOptLevel OL, bool JIT)
MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override
Create the target's instance of MachineFunctionInfo.
The core GVN pass object.
Definition GVN.h:128
Pass to remove unused function declarations.
Definition GlobalDCE.h:38
This pass is responsible for selecting generic machine instructions to target-specific instructions.
A pass that internalizes all functions and variables other than those that must be preserved accordin...
Definition Internalize.h:37
Converts loops into loop-closed SSA form.
Definition LCSSA.h:38
Performs Loop Invariant Code Motion Pass.
Definition LICM.h:66
This pass implements the localization mechanism described at the top of this file.
Definition Localizer.h:43
An optimization pass inserting data prefetches in loops.
Context object for machine code objects.
Definition MCContext.h:83
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
void addDelegate(Delegate *delegate)
const MachineFunction & getMF() const
MachineSchedRegistry provides a selection of available machine instruction schedulers.
This interface provides simple read-only access to a block of memory, and provides simple methods for...
virtual StringRef getBufferIdentifier() const
Return an identifier for this buffer, typically the filename it was read from.
const char * getBufferStart() const
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static LLVM_ABI const OptimizationLevel O0
Disable as many optimizations as possible.
static LLVM_ABI const OptimizationLevel O1
Optimize quickly without destroying debuggability.
This class provides access to building LLVM's passes.
This class manages callbacks registration, as well as provides a way for PassInstrumentation to pass ...
LLVM_ATTRIBUTE_MINSIZE std::enable_if_t<!std::is_same_v< PassT, PassManager > > addPass(PassT &&Pass)
PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM, ExtraArgTs... ExtraArgs)
Run all of the passes in this manager over the given unit of IR.
PassRegistry - This class manages the registration and initialization of the pass subsystem as appli...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
This pass implements the reg bank selector pass used in the GlobalISel pipeline.
RegisterPassParser class - Handle the addition of new machine passes.
RegisterRegAllocBase class - Track the registration of register allocators.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange)
void setFlag(Register Reg, uint8_t Flag)
bool checkFlag(Register Reg, uint8_t Flag) const
Instances of this class encapsulate one diagnostic report, allowing printing to a raw_ostream as a ca...
Definition SourceMgr.h:297
Represents a location in source code.
Definition SMLoc.h:22
static SMLoc getFromPointer(const char *Ptr)
Definition SMLoc.h:35
Represents a range in source code.
Definition SMLoc.h:47
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMILive is an implementation of ScheduleDAGInstrs that schedules machine instructions while...
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
const TargetInstrInfo * TII
Target instruction information.
const TargetRegisterInfo * TRI
Target processor register info.
Move instructions into successor blocks when possible.
Definition Sink.h:24
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
unsigned getMainFileID() const
Definition SourceMgr.h:148
const MemoryBuffer * getMemoryBuffer(unsigned i) const
Definition SourceMgr.h:141
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:730
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
bool consume_front(char Prefix)
Returns true if this StringRef has the given prefix and removes that prefix.
Definition StringRef.h:655
A switch()-like statement whose cases are string literals.
StringSwitch & Cases(std::initializer_list< StringLiteral > CaseStrings, T Value)
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Triple TargetTriple
Triple string, CPU name, and target feature strings the TargetMachine instance is created with.
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
StringRef getTargetFeatureString() const
StringRef getTargetCPU() const
std::unique_ptr< const MCSubtargetInfo > STI
TargetOptions Options
void resetTargetOptions(const Function &F) const
Reset the target options based on the function's attributes.
std::unique_ptr< const MCRegisterInfo > MRI
CodeGenOptLevel OptLevel
Target-Independent Code Generator Pass Configuration Options.
virtual void addCodeGenPrepare()
Add pass to prepare the LLVM IR for code generation.
virtual bool addILPOpts()
Add passes that optimize instruction level parallelism for out-of-order targets.
virtual void addPostRegAlloc()
This method may be implemented by targets that want to run passes after register allocation pass pipe...
CodeGenOptLevel getOptLevel() const
virtual void addOptimizedRegAlloc()
addOptimizedRegAlloc - Add passes related to register allocation.
virtual void addIRPasses()
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
virtual void addFastRegAlloc()
addFastRegAlloc - Add the minimum set of target-independent passes that are required for fast registe...
virtual void addMachineSSAOptimization()
addMachineSSAOptimization - Add standard passes that optimize machine instructions in SSA form.
void disablePass(AnalysisID PassID)
Allow the target to disable a specific standard pass by default.
AnalysisID addPass(AnalysisID PassID)
Utilities for targets to add passes to the pass manager.
TargetPassConfig(TargetMachine &TM, PassManagerBase &PM)
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM Value Representation.
Definition Value.h:75
bool use_empty() const
Definition Value.h:346
int getNumOccurrences() const
An efficient, type-erasing, non-owning reference to a callable.
PassManagerBase - An abstract interface to allow code to add passes to a pass manager without having ...
An abstract base class for streams implementations that also support a pwrite operation.
Interfaces for registering analysis passes, producing common pass manager configurations,...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
StringRef getSchedStrategy(const Function &F)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READNONE constexpr bool isModuleEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
template class LLVM_TEMPLATE_ABI opt< bool >
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
ScheduleDAGMILive * createSchedLive(MachineSchedContext *C)
Create the standard converging machine scheduler.
LLVM_ABI FunctionPass * createFlattenCFGPass()
std::unique_ptr< ScheduleDAGMutation > createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF)
LLVM_ABI FunctionPass * createFastRegisterAllocator()
FastRegisterAllocation Pass - This pass register allocates as fast as possible.
LLVM_ABI char & EarlyMachineLICMID
This pass performs loop invariant code motion on machine instructions.
ImmutablePass * createAMDGPUAAWrapperPass()
LLVM_ABI char & PostRAHazardRecognizerID
PostRAHazardRecognizer - This pass runs the post-ra hazard recognizer.
std::function< bool(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, const Register Reg)> RegAllocFilterFunc
Filter function for register classes during regalloc.
FunctionPass * createAMDGPUSetWavePriorityPass()
LLVM_ABI Pass * createLCSSAPass()
Definition LCSSA.cpp:525
void initializeAMDGPUMarkLastScratchLoadLegacyPass(PassRegistry &)
void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &)
void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &)
char & GCNPreRAOptimizationsID
LLVM_ABI char & GCLoweringID
GCLowering Pass - Used by gc.root to perform its default lowering operations.
void initializeSIInsertHardClausesLegacyPass(PassRegistry &)
ModulePass * createExpandVariadicsPass(ExpandVariadicsMode)
FunctionPass * createSIAnnotateControlFlowLegacyPass()
Create the annotation pass.
FunctionPass * createSIModeRegisterPass()
void initializeGCNPreRAOptimizationsLegacyPass(PassRegistry &)
void initializeSILowerWWMCopiesLegacyPass(PassRegistry &)
LLVM_ABI FunctionPass * createGreedyRegisterAllocator()
Greedy register allocation pass - This pass implements a global register allocator for optimized buil...
void initializeAMDGPUAAWrapperPassPass(PassRegistry &)
void initializeSIShrinkInstructionsLegacyPass(PassRegistry &)
ModulePass * createAMDGPULowerBufferFatPointersPass()
void initializeR600ClauseMergePassPass(PassRegistry &)
ModulePass * createAMDGPUCtorDtorLoweringLegacyPass()
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
ModuleToFunctionPassAdaptor createModuleToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
ModulePass * createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
void initializeGCNRewritePartialRegUsesLegacyPass(llvm::PassRegistry &)
void initializeAMDGPURewriteUndefForPHILegacyPass(PassRegistry &)
char & GCNRewritePartialRegUsesID
void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &)
LLVM_ABI std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition Error.cpp:94
void initializeAMDGPULowerVGPREncodingLegacyPass(PassRegistry &)
char & AMDGPUWaitSGPRHazardsLegacyID
void initializeSILowerSGPRSpillsLegacyPass(PassRegistry &)
LLVM_ABI Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifies whether or not this is a reentry into the IGroupLPDAGMutation.
void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &)
FunctionPass * createAMDGPURegBankCombiner(bool IsOptNone)
LLVM_ABI FunctionPass * createNaryReassociatePass()
char & AMDGPUReserveWWMRegsLegacyID
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &)
LLVM_ABI char & PatchableFunctionID
This pass implements the "patchable-function" attribute.
char & SIOptimizeExecMaskingLegacyID
LLVM_ABI char & PostRASchedulerID
PostRAScheduler - This pass performs post register allocation scheduling.
void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &)
void initializeR600PacketizerPass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createVOPDPairingMutation()
ModulePass * createAMDGPUExportKernelRuntimeHandlesLegacyPass()
ModulePass * createAMDGPUAlwaysInlinePass(bool GlobalOpt=true)
void initializeAMDGPUAsmPrinterPass(PassRegistry &)
void initializeSIFoldOperandsLegacyPass(PassRegistry &)
char & SILoadStoreOptimizerLegacyID
void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &)
PassManager< LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &, CGSCCUpdateResult & > CGSCCPassManager
The CGSCC pass manager.
LLVM_ABI std::unique_ptr< CSEConfigBase > getStandardCSEConfigForOpt(CodeGenOptLevel Level)
Definition CSEInfo.cpp:85
Target & getTheR600Target()
The target for R600 GPUs.
LLVM_ABI char & MachineSchedulerID
MachineScheduler - This pass schedules machine instructions.
LLVM_ABI Pass * createStructurizeCFGPass(bool SkipUniformRegions=false)
When SkipUniformRegions is true the structurizer will not structurize regions that only contain uniform...
LLVM_ABI char & PostMachineSchedulerID
PostMachineScheduler - This pass schedules machine instructions postRA.
LLVM_ABI Pass * createLICMPass()
Definition LICM.cpp:386
char & SIFormMemoryClausesID
void initializeSILoadStoreOptimizerLegacyPass(PassRegistry &)
void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &)
AnalysisManager< LazyCallGraph::SCC, LazyCallGraph & > CGSCCAnalysisManager
The CGSCC analysis manager.
void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &)
LLVM_ABI char & EarlyIfConverterLegacyID
EarlyIfConverter - This pass performs if-conversion on SSA form by inserting cmov instructions.
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
FunctionPass * createAMDGPUUniformIntrinsicCombineLegacyPass()
void initializeAMDGPURegBankCombinerPass(PassRegistry &)
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition Pass.h:77
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
Definition Pass.h:87
char & AMDGPUUnifyDivergentExitNodesID
void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &)
FunctionPass * createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy)
FunctionPass * createAMDGPUPreloadKernArgPrologLegacyPass()
char & SIOptimizeVGPRLiveRangeLegacyID
LLVM_ABI char & ShadowStackGCLoweringID
ShadowStackGCLowering - Implements the custom lowering mechanism used by the shadow stack GC.
char & GCNNSAReassignID
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &)
static Reloc::Model getEffectiveRelocModel(std::optional< Reloc::Model > RM)
void initializeAMDGPUExternalAAWrapperPass(PassRegistry &)
auto formatv(bool Validate, const char *Fmt, Ts &&...Vals)
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &)
void initializeSIModeRegisterLegacyPass(PassRegistry &)
CodeModel::Model getEffectiveCodeModel(std::optional< CodeModel::Model > CM, CodeModel::Model Default)
Helper method for getting the code model, returning Default if CM does not have a value.
void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &)
char & SILateBranchLoweringPassID
FunctionToLoopPassAdaptor createFunctionToLoopPassAdaptor(LoopPassT &&Pass, bool UseMemorySSA=false)
A function to deduce a loop pass type and wrap it in the templated adaptor.
LLVM_ABI char & BranchRelaxationPassID
BranchRelaxation - This pass replaces branches that need to jump further than is supported by a branc...
LLVM_ABI FunctionPass * createSinkingPass()
Definition Sink.cpp:275
CGSCCToFunctionPassAdaptor createCGSCCToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false, bool NoRerun=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
void initializeSIMemoryLegalizerLegacyPass(PassRegistry &)
ModulePass * createAMDGPULowerIntrinsicsLegacyPass()
void initializeR600MachineCFGStructurizerPass(PassRegistry &)
CodeGenFileType
These enums are meant to be passed into addPassesToEmitFile to indicate what type of file to emit,...
Definition CodeGen.h:111
char & GCNDPPCombineLegacyID
PassManager< Module > ModulePassManager
Convenience typedef for a pass manager over modules.
LLVM_ABI std::unique_ptr< ScheduleDAGMutation > createStoreClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, bool ReorderWhileClustering=false)
If ReorderWhileClustering is set to true, no attempt will be made to reduce reordering due to store c...
LLVM_ABI FunctionPass * createLoopDataPrefetchPass()
FunctionPass * createAMDGPULowerKernelArgumentsPass()
char & AMDGPUInsertDelayAluID
std::unique_ptr< ScheduleDAGMutation > createAMDGPUMacroFusionDAGMutation()
Note that you have to add: DAG.addMutation(createAMDGPUMacroFusionDAGMutation()); to AMDGPUTargetMach...
LLVM_ABI char & StackMapLivenessID
StackMapLiveness - This pass analyses the register live-out set of stackmap/patchpoint intrinsics and...
void initializeGCNPreRALongBranchRegLegacyPass(PassRegistry &)
char & SILowerWWMCopiesLegacyID
LLVM_ABI FunctionPass * createUnifyLoopExitsPass()
char & SIOptimizeExecMaskingPreRAID
LLVM_ABI FunctionPass * createFixIrreduciblePass()
void initializeR600EmitClauseMarkersPass(PassRegistry &)
LLVM_ABI char & FuncletLayoutID
This pass lays out funclets contiguously.
LLVM_ABI char & DetectDeadLanesID
This pass adds dead/undef flags after analyzing subregister lanes.
void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &)
void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &)
ScheduleDAGInstrs * createGCNNoopPostMachineScheduler(MachineSchedContext *C)
void initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(PassRegistry &)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
void initializeSIInsertWaitcntsLegacyPass(PassRegistry &)
ModulePass * createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *)
ModulePass * createAMDGPUPrintfRuntimeBinding()
LLVM_ABI char & StackSlotColoringID
StackSlotColoring - This pass performs stack slot coloring.
LLVM_ABI Pass * createAlwaysInlinerLegacyPass(bool InsertLifetime=true)
Create a legacy pass manager instance of a pass to inline and remove functions marked as "always_inli...
void initializeR600ControlFlowFinalizerPass(PassRegistry &)
void initializeAMDGPUImageIntrinsicOptimizerPass(PassRegistry &)
void initializeSILateBranchLoweringLegacyPass(PassRegistry &)
void initializeSILowerControlFlowLegacyPass(PassRegistry &)
void initializeSIFormMemoryClausesLegacyPass(PassRegistry &)
char & SIPreAllocateWWMRegsLegacyID
Error make_error(ArgTs &&... Args)
Make a Error instance representing failure using the given error info type.
Definition Error.h:340
ModulePass * createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &)
FunctionPass * createAMDGPUPromoteAlloca()
LLVM_ABI FunctionPass * createSeparateConstOffsetFromGEPPass(bool LowerGEP=false)
void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &)
char & SIPreEmitPeepholeID
char & SIPostRABundlerLegacyID
ModulePass * createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *)
void initializeGCNRegPressurePrinterPass(PassRegistry &)
void initializeSILowerI1CopiesLegacyPass(PassRegistry &)
char & SILowerSGPRSpillsLegacyID
LLVM_ABI FunctionPass * createBasicRegisterAllocator()
BasicRegisterAllocation Pass - This pass implements a degenerate global register allocator using the ...
LLVM_ABI void initializeGlobalISel(PassRegistry &)
Initialize all passes linked into the GlobalISel library.
char & SILowerControlFlowLegacyID
ModulePass * createR600OpenCLImageTypeLoweringPass()
FunctionPass * createAMDGPUCodeGenPreparePass()
void initializeSIAnnotateControlFlowLegacyPass(PassRegistry &)
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into a AMDGPU-specific.
void initializeGCNCreateVOPDLegacyPass(PassRegistry &)
void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &)
ScheduleDAGInstrs * createGCNCoExecMachineScheduler(MachineSchedContext *C)
void initializeSIPreAllocateWWMRegsLegacyPass(PassRegistry &)
void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &)
Target & getTheGCNTarget()
The target for GCN GPUs.
void initializeSIFixSGPRCopiesLegacyPass(PassRegistry &)
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &)
void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &)
LLVM_ABI FunctionPass * createGVNPass()
Create a legacy GVN pass.
Definition GVN.cpp:3406
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &)
void initializeSIPostRABundlerLegacyPass(PassRegistry &)
FunctionPass * createAMDGPURegBankSelectPass()
FunctionPass * createAMDGPURegBankLegalizePass()
LLVM_ABI char & MachineCSELegacyID
MachineCSE - This pass performs global CSE on machine instructions.
char & SIWholeQuadModeID
LLVM_ABI std::unique_ptr< ScheduleDAGMutation > createLoadClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, bool ReorderWhileClustering=false)
If ReorderWhileClustering is set to true, no attempt will be made to reduce reordering due to load c...
PassManager< Function > FunctionPassManager
Convenience typedef for a pass manager over functions.
LLVM_ABI char & LiveVariablesID
LiveVariables pass - This pass computes the set of blocks in which each variable is live and sets mac...
void initializeAMDGPUCodeGenPreparePass(PassRegistry &)
FunctionPass * createAMDGPURewriteUndefForPHILegacyPass()
void initializeSIOptimizeExecMaskingLegacyPass(PassRegistry &)
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition Threading.h:86
FunctionPass * createSILowerI1CopiesLegacyPass()
FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &)
char & SIInsertHardClausesID
char & SIFixSGPRCopiesLegacyID
void initializeGCNDPPCombineLegacyPass(PassRegistry &)
char & GCNCreateVOPDID
char & SIPeepholeSDWALegacyID
LLVM_ABI char & VirtRegRewriterID
VirtRegRewriter pass.
char & SIFixVGPRCopiesID
char & SIFoldOperandsLegacyID
void initializeGCNNSAReassignLegacyPass(PassRegistry &)
LLVM_ABI FunctionPass * createLowerSwitchPass()
void initializeAMDGPUPreloadKernArgPrologLegacyPass(PassRegistry &)
LLVM_ABI FunctionPass * createVirtRegRewriter(bool ClearVirtRegs=true)
void initializeR600VectorRegMergerPass(PassRegistry &)
char & AMDGPURewriteAGPRCopyMFMALegacyID
ModulePass * createAMDGPULowerExecSyncLegacyPass()
char & AMDGPULowerVGPREncodingLegacyID
FunctionPass * createAMDGPUGlobalISelDivergenceLoweringPass()
FunctionPass * createSIMemoryLegalizerPass()
void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &)
void initializeSIOptimizeVGPRLiveRangeLegacyPass(PassRegistry &)
void initializeSIPeepholeSDWALegacyPass(PassRegistry &)
void initializeAMDGPURegBankLegalizePass(PassRegistry &)
LLVM_ABI char & TwoAddressInstructionPassID
TwoAddressInstruction - This pass reduces two-address instructions to use two operands.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
FunctionPass * createAMDGPUPreLegalizeCombiner(bool IsOptNone)
void initializeAMDGPURegBankSelectPass(PassRegistry &)
FunctionPass * createAMDGPULateCodeGenPrepareLegacyPass()
LLVM_ABI FunctionPass * createAtomicExpandLegacyPass()
AtomicExpandPass - At IR level this pass replace atomic instructions with __atomic_* library calls,...
MCRegisterInfo * createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour)
LLVM_ABI FunctionPass * createStraightLineStrengthReducePass()
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
Definition Allocator.h:383
FunctionPass * createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *)
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry &)
void initializeAMDGPULowerBufferFatPointersPass(PassRegistry &)
FunctionPass * createSIInsertWaitcntsPass()
FunctionPass * createAMDGPUAnnotateUniformValuesLegacy()
LLVM_ABI FunctionPass * createEarlyCSEPass(bool UseMemorySSA=false)
void initializeSIWholeQuadModeLegacyPass(PassRegistry &)
LLVM_ABI char & PHIEliminationID
PHIElimination - This pass eliminates machine instruction PHI nodes by inserting copy instructions.
LLVM_ABI llvm::cl::opt< bool > NoKernelInfoEndLTO
bool parseNamedRegisterReference(PerFunctionMIParsingState &PFS, Register &Reg, StringRef Src, SMDiagnostic &Error)
void initializeAMDGPUResourceUsageAnalysisWrapperPassPass(PassRegistry &)
FunctionPass * createSIShrinkInstructionsLegacyPass()
char & AMDGPUPrepareAGPRAllocLegacyID
char & AMDGPUMarkLastScratchLoadID
LLVM_ABI char & RenameIndependentSubregsID
This pass detects subregister lanes in a virtual register that are used independently of other lanes ...
void initializeAMDGPUAnnotateUniformValuesLegacyPass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createAMDGPUExportClusteringDAGMutation()
void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry &)
void initializeAMDGPUPromoteAllocaPass(PassRegistry &)
void initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF)
void initializeAMDGPUAlwaysInlinePass(PassRegistry &)
LLVM_ABI char & DeadMachineInstructionElimID
DeadMachineInstructionElim - This pass removes dead machine instructions.
void initializeSIPreEmitPeepholeLegacyPass(PassRegistry &)
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
char & AMDGPUPerfHintAnalysisLegacyID
LLVM_ABI ImmutablePass * createExternalAAWrapperPass(std::function< void(Pass &, Function &, AAResults &)> Callback)
A wrapper pass around a callback which can be used to populate the AAResults in the AAResultsWrapperP...
char & GCNPreRALongBranchRegID
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &)
#define N
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ IEEE
IEEE-754 denormal numbers preserved.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
A simple and fast domtree-based CSE pass.
Definition EarlyCSE.h:31
MachineFunctionInfo - This class can be derived from and used by targets to hold private target-speci...
static FuncInfoTy * create(BumpPtrAllocator &Allocator, const Function &F, const SubtargetTy *STI)
Factory function: default behavior is to call new using the supplied allocator.
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
StringMap< VRegInfo * > VRegInfosNamed
Definition MIParser.h:178
DenseMap< Register, VRegInfo * > VRegInfos
Definition MIParser.h:177
RegisterTargetMachine - Helper template for registering a target machine implementation,...
A utility pass template to force an analysis result to be available.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
The llvm::once_flag structure.
Definition Threading.h:67
Targets should override this in a way that mirrors the implementation of llvm::MachineFunctionInfo.
SmallVector< StringValue > WWMReservedRegs
std::optional< SIArgumentInfo > ArgInfo
SmallVector< StringValue, 2 > SpillPhysVGPRS
A wrapper around std::string which contains a source range that's being set during parsing.