//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains both the AMDGPU target machine and the CodeGen pass
/// builder. The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs in the legacy pass manager
/// pipeline. The CodeGen pass builder handles the pass pipeline for the new
/// pass manager.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUSplitModule.h"
#include "GCNDPPCombine.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIFixSGPRCopies.h"
#include "SIFixVGPRCopies.h"
#include "SIFoldOperands.h"
#include "SILowerControlFlow.h"
#include "SILowerSGPRSpills.h"
#include "SILowerWWMCopies.h"
#include "SIMachineScheduler.h"
#include "SIPeepholeSDWA.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
  WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                const Register Reg) {
  const SIMachineFunctionInfo *MFI =
      MRI.getMF().getInfo<SIMachineFunctionInfo>();
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
         MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
}

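// Taken together, the three predicates above partition the virtual registers
// into disjoint allocation classes, so each staged allocator run below only
// ever sees its own subset. A rough sketch of the split (illustrative summary,
// not code from this file):
//
//   onlyAllocateSGPRs   -> scalar (SGPR) register classes
//   onlyAllocateWWMRegs -> vector registers flagged WWM_REG (whole-wave values)
//   onlyAllocateVGPRs   -> every remaining per-lane vector register
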
/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
    defaultSGPRRegAlloc("default",
                        "pick SGPR register allocator based on -O option",
                        useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for VGPRs"));

static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<WWMRegisterRegAlloc>>
    WWMRegAlloc("wwm-regalloc", cl::Hidden,
                cl::init(&useDefaultRegisterAllocator),
                cl::desc("Register allocator to use for WWM registers"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static void initializeDefaultWWMRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = WWMRegAlloc;
    WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static FunctionPass *createBasicWWMRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createGreedyWWMRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createFastWWMRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);

static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
                                               "basic register allocator",
                                               createBasicWWMRegisterAllocator);
static WWMRegisterRegAlloc
    greedyRegAllocWWMReg("greedy", "greedy register allocator",
                         createGreedyWWMRegisterAllocator);
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
                                              createFastWWMRegisterAllocator);

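// With the registrations above in place, each register class can be given its
// own allocator on the command line. A hypothetical invocation (GPU and input
// chosen purely for illustration; the flags are the ones declared above):
//
//   llc -mtriple=amdgcn -mcpu=gfx1030 \
//       -sgpr-regalloc=greedy -wwm-regalloc=basic -vgpr-regalloc=fast in.ll
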
static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
  return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
         Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
}
} // anonymous namespace

static cl::opt<bool>
    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                            cl::desc("Run early if-conversion"),
                            cl::init(false));

static cl::opt<bool>
    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                     cl::desc("Run pre-RA exec mask optimizations"),
                     cl::init(true));

static cl::opt<bool>
    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
                  cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"),
    cl::init(true),
    cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
    cl::init(true),
    cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"),
    cl::init(false),
    cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"),
    cl::init(false),
    cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"),
    cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool>
    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
                              cl::desc("Enable AMDGPU Alias Analysis"),
                              cl::init(true));

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"),
    cl::init(true),
    cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
    EnableDCEInRA("amdgpu-dce-in-ra",
                  cl::init(true), cl::Hidden,
                  cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
                     cl::desc("Enable lowering of lds to global memory pass "
                              "and asan instrument resulting IR."),
                     cl::init(true), cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
    "amdgpu-enable-image-intrinsic-optimizer",
    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<std::string>
    AMDGPUSchedStrategy("amdgpu-sched-strategy",
                        cl::desc("Select custom AMDGPU scheduling strategy."),
                        cl::Hidden, cl::init(""));

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableHipStdPar(
    "amdgpu-enable-hipstdpar",
    cl::desc("Enable HIP Standard Parallelism Offload support"),
    cl::init(false), cl::Hidden);

static cl::opt<bool>
    EnableAMDGPUAttributor("amdgpu-attributor-enable",
                           cl::desc("Enable AMDGPUAttributorPass"),
                           cl::init(true), cl::Hidden);

static cl::opt<bool> NewRegBankSelect(
    "new-reg-bank-select",
    cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
             "regbankselect"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> HasClosedWorldAssumption(
    "amdgpu-link-time-closed-world",
    cl::desc("Whether has closed-world assumption at link time"),
    cl::init(false), cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
    SISchedRegistry("si", "Run SI's custom scheduler",
                    createSIMachineScheduler);

static MachineSchedRegistry
    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                                 "Run GCN scheduler to maximize occupancy",
                                 createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                           createGCNMaxILPMachineScheduler);

static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
    "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
    createGCNMaxMemoryClauseMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);

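// Each MachineSchedRegistry entry above becomes a value for llc's -misched
// option; a hypothetical run of the ILP-oriented strategy (input name chosen
// only for illustration) would look like:
//
//   llc -mtriple=amdgcn -misched=gcn-max-ilp kernel.ll
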
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
  // (address space 7), and 128-bit non-integral buffer resources (address
  // space 8) which cannot be non-trivially accessed by LLVM memory operations
  // like getelementptr.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
         "v32:32-v48:64-v96:"
         "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
         "G1-ni:7:8:9";
}

static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : CodeGenTargetMachineImpl(
          T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options,
          getEffectiveRelocModel(RM),
          getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
           F->getName().starts_with("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
  if (Params.empty())
    return ScanOptions::Iterative;

  Params.consume_front("strategy=");
  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                    .Case("dpp", ScanOptions::DPP)
                    .Cases("iterative", "", ScanOptions::Iterative)
                    .Case("none", ScanOptions::None)
                    .Default(std::nullopt);
  if (Result)
    return *Result;
  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}

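// The parser above consumes the new-pass-manager parameter syntax, so a
// hypothetical opt invocation selecting the DPP scan strategy would be:
//
//   opt -passes='amdgpu-atomic-optimizer<strategy=dpp>' -S in.ll
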
static Expected<AMDGPUAttributorOptions>
parseAMDGPUAttributorPassOptions(StringRef Params) {
  AMDGPUAttributorOptions Result;
  while (!Params.empty()) {
    StringRef ParamName;
    std::tie(ParamName, Params) = Params.split(';');
    if (ParamName == "closed-world") {
      Result.IsClosedWorld = true;
    } else {
      return make_error<StringError>(
          formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
              .str(),
          inconvertibleErrorCode());
    }
  }
  return Result;
}

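// Likewise, a hypothetical pipeline string turning on the attributor's
// closed-world assumption would be:
//
//   opt -passes='amdgpu-attributor<closed-world>' -S in.ll
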
void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineStartEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        if (EnableHipStdPar)
          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level,
         ThinOrFullLTOPhase Phase) {
        if (!isLTOPreLink(Phase))
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());

        // We don't want to run internalization at per-module stage.
        if (InternalizeSymbols && !isLTOPreLink(Phase)) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerPeepholeEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify)
          FPM.addPass(AMDGPUSimplifyLibCallsPass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });

  // FIXME: Why is AMDGPUAttributor not in CGSCC?
  PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
                                            OptimizationLevel Level,
                                            ThinOrFullLTOPhase Phase) {
    if (Level != OptimizationLevel::O0) {
      if (!isLTOPreLink(Phase))
        MPM.addPass(AMDGPUAttributorPass(*this));
    }
  });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableSwLowerLDS)
          PM.addPass(AMDGPUSwLowerLDSPass(*this));
        if (EnableLowerModuleLDS)
          PM.addPass(AMDGPULowerModuleLDSPass(*this));
        if (Level != OptimizationLevel::O0) {
          // Do we really need internalization in LTO?
          if (InternalizeSymbols) {
            PM.addPass(InternalizePass(mustPreserveGV));
            PM.addPass(GlobalDCEPass());
          }
          if (EnableAMDGPUAttributor) {
            AMDGPUAttributorOptions Opt;
            if (HasClosedWorldAssumption)
              Opt.IsClosedWorld = true;
            PM.addPass(AMDGPUAttributorPass(*this, Opt));
          }
        }
      });

  PB.registerRegClassFilterParsingCallback(
      [](StringRef FilterName) -> RegAllocFilterFunc {
        if (FilterName == "sgpr")
          return onlyAllocateSGPRs;
        if (FilterName == "vgpr")
          return onlyAllocateVGPRs;
        if (FilterName == "wwm")
          return onlyAllocateWWMRegs;
        return nullptr;
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

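// A short worked example of the sentinel above (IR shown purely for
// illustration): in the local, private, and region address spaces the address
// 0 is valid, so the all-ones pattern serves as the null value instead, and
//
//   %p = addrspacecast ptr null to ptr addrspace(5)
//
// folds to the 32-bit value -1 (0xFFFFFFFF) rather than 0.
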
bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD) // TODO: Handle invariant load like constant.
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

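// Illustrative IR for the case handled above (hypothetical snippet): a flat
// pointer loaded from constant memory (addrspace(4)) may be assumed to point
// to global memory:
//
//   %fp = load ptr, ptr addrspace(4) %arg   ; assumed addrspace(1)
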
std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logic 'and' is commutative and
  // the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}

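// The pattern match above fires on guards of the following shape (hypothetical
// IR), allowing %p to be treated as a global pointer inside the guarded block:
//
//   %is.s  = call i1 @llvm.amdgcn.is.shared(ptr %p)
//   %not.s = xor i1 %is.s, true
//   %is.p  = call i1 @llvm.amdgcn.is.private(ptr %p)
//   %not.p = xor i1 %is.p, true
//   %guard = and i1 %not.s, %not.p   ; %p may be assumed addrspace(1)
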
unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

bool AMDGPUTargetMachine::splitModule(
    Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
  // but all current users of this API don't have one ready and would need to
  // create one anyway. Let's hide the boilerplate for now to keep it simple.

  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(this);
  PB.registerModuleAnalyses(MAM);
  PB.registerFunctionAnalyses(FAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
  MPM.run(M, MAM);
  return true;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

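// Note that the cache above is keyed on the concatenated "target-cpu" and
// "target-features" strings, so functions with distinct attributes get their
// own subtarget. For example (hypothetical IR attributes), a function tagged
//
//   attributes #0 = { "target-cpu"="gfx90a" "target-features"="+wavefrontsize64" }
//
// is compiled against its own GCNSubtarget rather than the module default.
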
TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

Error GCNTargetMachine::buildCodeGenPipeline(
    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
    CodeGenFileType FileType, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC) {
  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
}

//===----------------------------------------------------------------------===//
// AMDGPU Legacy Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
        C, std::make_unique<PostGenericScheduler>(C),
        /*RemoveKillFlags=*/true);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(
        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
    if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
      DAG->addMutation(createVOPDPairingMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createWWMRegAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(createLoopDataPrefetchPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  Triple::ArchType Arch = TM.getTargetTriple().getArch();
  if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  if (LowerCtorDtor)
    addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (Arch == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass());

  // Lower LDS accesses to global memory pass if address sanitizer is enabled.
  if (EnableSwLowerLDS)
    addPass(createAMDGPUSwLowerLDSLegacyPass(&TM));

  // Runs before PromoteAlloca so the latter can account for function uses
  if (EnableLowerModuleLDS) {
    addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
  }

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(createInferAddressSpacesPass());

  // Run atomic optimizer before Atomic Expand
  if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
  }

  addPass(createAtomicExpandLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
    // have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less)
      addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  // %0 = add %a, %b
  // %1 = add %b, %a
  //
  // and
  //
  // %0 = shl nsw %a, 2
  // %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // This lowering has been placed after codegenprepare to take advantage of
    // address mode matching (which is why it isn't put with the LDS lowerings).
    // It could be placed anywhere before uniformity annotations (an analysis
    // that it changes by splitting up fat pointers into their components)
    // but has been put before switch lowering and CFG flattening so that those
    // passes can run on the more optimized control flow this pass creates in
    // many cases.
    //
    // FIXME: This should ideally be put after the LoadStoreVectorizer.
    // However, due to some annoying facts about ResourceUsageAnalysis,
    // (especially as exercised in the resource-usage-dead-function test),
    // we need all the function passes codegenprepare all the way through
    // said resource usage analysis to run on the call graph produced
    // before codegenprepare runs (because codegenprepare will knock some
    // nodes out of the graph, which leads to function-level passes not
    // being run on them, which causes crashes in the resource usage analysis).
    addPass(createAMDGPULowerBufferFatPointersPass());
    // In accordance with the above FIXME, manually force all the
    // function-level passes into a CGSCCPassManager.
    addPass(new DummyCGSCCPass());
  }

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it
  // here seems better; these blocks would get cleaned up by
  // UnreachableBlockElim, inserted next in the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

//===----------------------------------------------------------------------===//
// GCN Legacy Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
    MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);

  Attribute SchedStrategyAttr =
      C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
  StringRef SchedStrategy = SchedStrategyAttr.isValid()
                                ? SchedStrategyAttr.getValueAsString()
                                : AMDGPUSchedStrategy;

  if (SchedStrategy == "max-ilp")
    return createGCNMaxILPMachineScheduler(C);

  if (SchedStrategy == "max-memory-clause")
    return createGCNMaxMemoryClauseMachineScheduler(C);

  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSinkingPass());

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPULateCodeGenPrepareLegacyPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(createAMDGPUUnifyDivergentExitNodesPass());
  addPass(createFixIrreduciblePass());
  addPass(createUnifyLoopExitsPass());
  addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions

  addPass(createAMDGPUAnnotateUniformValuesLegacy());
  addPass(createSIAnnotateControlFlowLegacyPass());
  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(createAMDGPURewriteUndefForPHILegacyPass());

  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    addPass(&AMDGPUPerfHintAnalysisLegacyID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsLegacyID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineLegacyID);
  addPass(&SILoadStoreOptimizerLegacyID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWALegacyID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSELegacyID);
    addPass(&SIFoldOperandsLegacyID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsLegacyPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterLegacyID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesLegacyID);
  addPass(createSILowerI1CopiesLegacyPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}

bool GCNPassConfig::addRegBankSelect() {
  if (NewRegBankSelect) {
    addPass(createAMDGPURegBankSelectPass());
    addPass(createAMDGPURegBankLegalizePass());
  } else {
    addPass(new RegBankSelect());
  }
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
  // the register in LiveVariables, this would trigger a failure in verifier,
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);

  if (EnableRewritePartialRegUses)
    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&MachineSchedulerID, &AMDGPUPreRAOptimizationsID);

  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyWWMRegisterAllocator();

  return createFastWWMRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

1565 "-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
1566 "and -vgpr-regalloc";
1567
1568bool GCNPassConfig::addRegAssignAndRewriteFast() {
1569 if (!usingDefaultRegAlloc())
1571
1572 addPass(&GCNPreRALongBranchRegID);
1573
1574 addPass(createSGPRAllocPass(false));
1575
1576 // Equivalent of PEI for SGPRs.
1577 addPass(&SILowerSGPRSpillsLegacyID);
1578
1579 // To Allocate wwm registers used in whole quad mode operations (for shaders).
1581
1582 // For allocating other wwm register operands.
1583 addPass(createWWMRegAllocPass(false));
1584
1585 addPass(&SILowerWWMCopiesLegacyID);
1586 addPass(&AMDGPUReserveWWMRegsID);
1587
1588 // For allocating per-thread VGPRs.
1589 addPass(createVGPRAllocPass(false));
1590
1591 return true;
1592}
1593
bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // At this point, the sgpr-regalloc has been done and it is good to have the
  // stack slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
  addPass(&StackSlotColoringID);

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsLegacyID);

  // To Allocate wwm registers used in whole quad mode operations (for shaders).
  addPass(&SIPreAllocateWWMRegsLegacyID);

  // For allocating other whole wave mode registers.
  addPass(createWWMRegAllocPass(true));
  addPass(&SILowerWWMCopiesLegacyID);
  addPass(createVirtRegRewriter(false));
  addPass(&AMDGPUReserveWWMRegsID);

  // For allocating per-thread VGPRs.
  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingLegacyID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsLegacyPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         {}, {});
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  for (const auto &[_, Info] : PFS.VRegInfosNamed) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }
  for (const auto &[_, Info] : PFS.VRegInfos) {
    MFI->setFlag(Info->VReg, Info->Flags);
  }

  for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
    Register ParsedReg;
    if (parseRegister(YamlRegStr, ParsedReg))
      return true;
    MFI->SpillPhysVGPRs.push_back(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  if (YamlMFI.HasInitWholeWave)
    MFI->setInitWholeWave();

  return false;
}

//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//

AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
    GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC)
    : CodeGenPassBuilder(TM, Opts, PIC) {
  Opt.RequiresCodeGenSCCOrder = true;
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  // Garbage collection is not supported.
  disablePass<StackMapLivenessPass, FuncletLayoutPass,
              ShadowStackGCLoweringPass>();
}

void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
  // TODO: Missing AMDGPURemoveIncompatibleFunctions

  addPass(AMDGPUPrintfRuntimeBindingPass());
  if (LowerCtorDtor)
    addPass(AMDGPUCtorDtorLoweringPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(AMDGPUImageIntrinsicOptimizerPass(TM));

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  addPass(AMDGPUAlwaysInlinePass());
  addPass(AlwaysInlinerPass());

  if (EnableSwLowerLDS)
    addPass(AMDGPUSwLowerLDSPass(TM));

  // Runs before PromoteAlloca so the latter can account for function uses
  if (EnableLowerModuleLDS)
    addPass(AMDGPULowerModuleLDSPass(TM));

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(InferAddressSpacesPass());

  // Run atomic optimizer before Atomic Expand
  if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
    addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));

  addPass(AtomicExpandPass(&TM));

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(AMDGPUPromoteAllocaPass(TM));

    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses(addPass);

    // TODO: Handle EnableAMDGPUAliasAnalysis

    // TODO: May want to move later or split into an early and late one.
    addPass(AMDGPUCodeGenPreparePass(TM));

    // TODO: LICM
  }

  Base::addIRPasses(addPass);

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  // %0 = add %a, %b
  // %1 = add %b, %a
  //
  // and
  //
  // %0 = shl nsw %a, 2
  // %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass(addPass);
}

void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
  // AMDGPUAnnotateKernelFeaturesPass is missing here, but it will hopefully be
  // deleted soon.

  if (EnableLowerKernelArguments)
    addPass(AMDGPULowerKernelArgumentsPass(TM));

  // This lowering has been placed after codegenprepare to take advantage of
  // address mode matching (which is why it isn't put with the LDS lowerings).
  // It could be placed anywhere before uniformity annotations (an analysis
  // that it changes by splitting up fat pointers into their components)
  // but has been put before switch lowering and CFG flattening so that those
  // passes can run on the more optimized control flow this pass creates in
  // many cases.
  //
  // FIXME: This should ideally be put after the LoadStoreVectorizer.
  // However, due to some annoying facts about ResourceUsageAnalysis,
  // (especially as exercised in the resource-usage-dead-function test),
  // we need all the function passes codegenprepare all the way through
  // said resource usage analysis to run on the call graph produced
  // before codegenprepare runs (because codegenprepare will knock some
  // nodes out of the graph, which leads to function-level passes not
  // being run on them, which causes crashes in the resource usage analysis).
  addPass(AMDGPULowerBufferFatPointersPass(TM));

  Base::addCodeGenPrepare(addPass);

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(LoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here seems better;
  // these blocks would get cleaned up by UnreachableBlockElim, inserted next
  // in the pass flow.
  addPass(LowerSwitchPass());
}

void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(FlattenCFGPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(SinkingPass());

  addPass(AMDGPULateCodeGenPreparePass(TM));

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.

  addPass(AMDGPUUnifyDivergentExitNodesPass());
  addPass(FixIrreduciblePass());
  addPass(UnifyLoopExitsPass());
  addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false));

  addPass(AMDGPUAnnotateUniformValuesPass());

  addPass(SIAnnotateControlFlowPass(TM));

  // TODO: Move this right after structurizeCFG to avoid extra divergence
  // analysis. This depends on stopping SIAnnotateControlFlow from making
  // control flow modifications.
  addPass(AMDGPURewriteUndefForPHIPass());

  addPass(LCSSAPass());

  if (TM.getOptLevel() > CodeGenOptLevel::Less)
    addPass(AMDGPUPerfHintAnalysisPass(TM));

  // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
  // isn't this in addInstSelector?
  addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>());
}

void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
  if (EnableEarlyIfConversion)
    addPass(EarlyIfConverterPass());

  Base::addILPOpts(addPass);
}

void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
                                             CreateMCStreamer) const {
  // TODO: Add AsmPrinter.
}

Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
  addPass(AMDGPUISelDAGToDAGPass(TM));
  addPass(SIFixSGPRCopiesPass());
  addPass(SILowerI1CopiesPass());
  return Error::success();
}

void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
    AddMachinePass &addPass) const {
  Base::addMachineSSAOptimization(addPass);

  addPass(SIFoldOperandsPass());
  if (EnableDPPCombine) {
    addPass(GCNDPPCombinePass());
  }
  addPass(SILoadStoreOptimizerPass());
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(SIPeepholeSDWAPass());
    addPass(EarlyMachineLICMPass());
    addPass(MachineCSEPass());
    addPass(SIFoldOperandsPass());
  }
  addPass(DeadMachineInstructionElimPass());
  addPass(SIShrinkInstructionsPass());
}

void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
  addPass(SIFixVGPRCopiesPass());
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(SIOptimizeExecMaskingPass());
  Base::addPostRegAlloc(addPass);
}

bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
                                             CodeGenOptLevel Level) const {
  if (Opt.getNumOccurrences())
    return Opt;
  if (TM.getOptLevel() < Level)
    return false;
  return Opt;
}
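
// In other words, an explicit command-line occurrence always wins; otherwise
// the option only applies at or above the given opt level. Illustrative
// behavior, using a flag declared earlier in this file:
//
//   isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)
//     -O0, flag unset               -> false (below the threshold)
//     -O2, flag unset               -> true  (the flag's default)
//     -amdgpu-enable-vopd=0, any -O -> false (explicit setting wins)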

void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(AddIRPass &addPass) const {
  if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(GVNPass());
  else
    addPass(EarlyCSEPass());
}

void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
    AddIRPass &addPass) const {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(LoopDataPrefetchPass());

  addPass(SeparateConstOffsetFromGEPPass());

  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(StraightLineStrengthReducePass());

  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass(addPass);

  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(NaryReassociatePass());

  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(EarlyCSEPass());
}
static cl::opt< bool > EnableLibCallSimplify("amdgpu-simplify-libcall", cl::desc("Enable amdgpu library simplifications"), cl::init(true), cl::Hidden)
static MachineSchedRegistry GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp", createGCNMaxILPMachineScheduler)
static cl::opt< bool > InternalizeSymbols("amdgpu-internalize-symbols", cl::desc("Enable elimination of non-kernel functions and unused globals"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableAMDGPUAttributor("amdgpu-attributor-enable", cl::desc("Enable AMDGPUAttributorPass"), cl::init(true), cl::Hidden)
static LLVM_READNONE StringRef getGPUOrDefault(const Triple &TT, StringRef GPU)
static Reloc::Model getEffectiveRelocModel(std::optional< Reloc::Model > RM)
Expected< AMDGPUAttributorOptions > parseAMDGPUAttributorPassOptions(StringRef Params)
static cl::opt< bool > EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true))
static Expected< ScanOptions > parseAMDGPUAtomicOptimizerStrategy(StringRef Params)
static ScheduleDAGInstrs * createMinRegScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableHipStdPar("amdgpu-enable-hipstdpar", cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableInsertDelayAlu("amdgpu-enable-delay-alu", cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer", cl::desc("Enable load store vectorizer"), cl::init(true), cl::Hidden)
static bool mustPreserveGV(const GlobalValue &GV)
Predicate for Internalize pass.
static cl::opt< bool > EnableLoopPrefetch("amdgpu-loop-prefetch", cl::desc("Enable loop data prefetch on AMDGPU"), cl::Hidden, cl::init(false))
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget()
static cl::opt< bool > NewRegBankSelect("new-reg-bank-select", cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of " "regbankselect"), cl::init(false), cl::Hidden)
static cl::opt< bool > RemoveIncompatibleFunctions("amdgpu-enable-remove-incompatible-functions", cl::Hidden, cl::desc("Enable removal of functions when they" "use features not supported by the target GPU"), cl::init(true))
static cl::opt< bool > EnableScalarIRPasses("amdgpu-scalar-ir-passes", cl::desc("Enable scalar IR passes"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableRegReassign("amdgpu-reassign-regs", cl::desc("Enable register reassign optimizations on gfx10+"), cl::init(true), cl::Hidden)
static cl::opt< bool > OptVGPRLiveRange("amdgpu-opt-vgpr-liverange", cl::desc("Enable VGPR liverange optimizations for if-else structure"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createSIMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations", cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), cl::Hidden)
static cl::opt< ScanOptions > AMDGPUAtomicOptimizerStrategy("amdgpu-atomic-optimizer-strategy", cl::desc("Select DPP or Iterative strategy for scan"), cl::init(ScanOptions::Iterative), cl::values(clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"), clEnumValN(ScanOptions::Iterative, "Iterative", "Use Iterative approach for scan"), clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")))
static cl::opt< bool > EnableVOPD("amdgpu-enable-vopd", cl::desc("Enable VOPD, dual issue of VALU in wave32"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(false))
static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static MachineSchedRegistry GCNILPSchedRegistry("gcn-iterative-ilp", "Run GCN iterative scheduler for ILP scheduling (experimental)", createIterativeILPMachineScheduler)
static cl::opt< bool > ScalarizeGlobal("amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), cl::init(true), cl::Hidden)
static const char RegAllocOptNotSupportedMessage[]
static MachineSchedRegistry GCNMaxOccupancySchedRegistry("gcn-max-occupancy", "Run GCN scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler)
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file declares the AMDGPU-specific subclass of TargetLoweringObjectFile.
This file a TargetTransformInfo::Concept conforming object specific to the AMDGPU target machine.
Provides passes to inlining "always_inline" functions.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This header provides classes for managing passes over SCCs of the call graph.
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Provides analysis for continuously CSEing during GISel passes.
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_READNONE
Definition: Compiler.h:299
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:128
This file provides the interface for a simple, fast CSE pass.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file defines the class GCNIterativeScheduler, which uses an iterative approach to find a best sc...
This file provides the interface for LLVM's Global Value Numbering pass which eliminates fully redund...
#define _
AcceleratorCodeSelection - Identify all functions reachable from a kernel, removing those that are un...
This file declares the IRTranslator pass.
This header defines various interfaces for pass management in LLVM.
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
static std::string computeDataLayout()
This file provides the interface for LLVM's Loop Data Prefetching Pass.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
CGSCCAnalysisManager CGAM
LoopAnalysisManager LAM
FunctionAnalysisManager FAM
ModuleAnalysisManager MAM
PassInstrumentationCallbacks PIC
PassBuilder PB(Machine, PassOpts->PTO, std::nullopt, &PIC)
static bool isLTOPreLink(ThinOrFullLTOPhase Phase)
The AMDGPU TargetMachine interface definition for hw codegen targets.
Basic Register Allocator
This file describes the interface of the MachineFunctionPass responsible for assigning the generic vi...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI Machine Scheduler interface.
static FunctionPass * useDefaultRegisterAllocator()
-regalloc=... command line option.
Target-Independent Code Generator Pass Configuration Options pass.
LLVM IR instance of the generic uniformity analysis.
static std::unique_ptr< TargetLoweringObjectFile > createTLOF()
A manager for alias analyses.
void registerFunctionAnalysis()
Register a specific AA result.
void addAAResult(AAResultT &AAResult)
Register a specific AA result.
Legacy wrapper pass to provide the AMDGPUAAResult object.
Analysis pass providing a never-invalidated alias analysis result.
Error addInstSelector(AddMachinePass &) const
void addMachineSSAOptimization(AddMachinePass &) const
void addEarlyCSEOrGVNPass(AddIRPass &) const
void addStraightLineScalarOptimizationPasses(AddIRPass &) const
AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM, const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC)
void addPreISel(AddIRPass &addPass) const
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const
void addCodeGenPrepare(AddIRPass &) const
void addILPOpts(AddMachinePass &) const
void addPostRegAlloc(AddMachinePass &) const
bool isPassEnabled(const cl::opt< bool > &Opt, CodeGenOptLevel Level=CodeGenOptLevel::Default) const
Check if a pass is enabled given Opt option.
Lower llvm.global_ctors and llvm.global_dtors to special kernels.
AMDGPUTargetMachine & getAMDGPUTargetMachine() const
std::unique_ptr< CSEConfigBase > getCSEConfig() const override
Returns the CSEConfig object to use for the current optimization level.
ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override
Create an instance of ScheduleDAGInstrs to be run within the standard MachineScheduler pass for this ...
bool isPassEnabled(const cl::opt< bool > &Opt, CodeGenOptLevel Level=CodeGenOptLevel::Default) const
Check if a pass is enabled given Opt option.
bool addPreISel() override
Methods with trivial inline returns are convenient points in the common codegen pass pipeline where t...
bool addInstSelector() override
addInstSelector - This method should install an instruction selector pass, which converts from LLVM c...
bool addGCPasses() override
addGCPasses - Add late codegen passes that analyze code for garbage collection.
AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
void addIRPasses() override
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
void addCodeGenPrepare() override
Add pass to prepare the LLVM IR for code generation.
Splits the module M into N linkable partitions.
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override
getAddressSpaceForPseudoSourceKind - Given the kind of memory (e.g.
const TargetSubtargetInfo * getSubtargetImpl() const
void registerDefaultAliasAnalyses(AAManager &) override
Allow the target to register alias analyses with the AAManager for use with the new pass manager.
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const override
If the specified predicate checks whether a generic pointer falls within a specified address space,...
StringRef getFeatureString(const Function &F) const
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM, CodeGenOptLevel OL)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
void registerPassBuilderCallbacks(PassBuilder &PB) override
Allow the target to modify the pass pipeline.
StringRef getGPUName(const Function &F) const
unsigned getAssumedAddrSpace(const Value *V) const override
If the specified generic pointer could be assumed as a pointer to a specific address space,...
bool splitModule(Module &M, unsigned NumParts, function_ref< void(std::unique_ptr< Module > MPart)> ModuleCallback) override
Entry point for module splitting.
Inlines functions marked as "always_inline".
Definition: AlwaysInliner.h:32
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:392
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition: Attributes.h:208
Allocate memory in an ever growing pool, as if by bump-pointer.
Definition: Allocator.h:66
This class provides access to building LLVM's passes.
void addPostRegAlloc(AddMachinePass &) const
This method may be implemented by targets that want to run passes after register allocation pass pipe...
void addILPOpts(AddMachinePass &) const
Add passes that optimize instruction level parallelism for out-of-order targets.
Error buildPipeline(ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, CodeGenFileType FileType) const
void addMachineSSAOptimization(AddMachinePass &) const
Methods with trivial inline returns are convenient points in the common codegen pass pipeline where t...
void addCodeGenPrepare(AddIRPass &) const
Add pass to prepare the LLVM IR for code generation.
void disablePass()
Allow the target to disable a specific pass by default.
void addIRPasses(AddIRPass &) const
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
implements a set of functionality in the TargetMachine class for targets that make use of the indepen...
void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Definition: Constants.cpp:739
This pass is required by interprocedural register allocation.
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
static ErrorSuccess success()
Create a success value.
Definition: Error.h:337
Tagged union holding either a T or a Error.
Definition: Error.h:481
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:291
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
void registerMachineRegisterInfoCallback(MachineFunction &MF) const override
bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const override
Parse out the target's MachineFunctionInfo from the YAML reprsentation.
yaml::MachineFunctionInfo * convertFuncInfoToYAML(const MachineFunction &MF) const override
Allocate and initialize an instance of the YAML representation of the MachineFunctionInfo.
Error buildCodeGenPipeline(ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, CodeGenFileType FileType, const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC) override
yaml::MachineFunctionInfo * createDefaultFuncInfoYAML() const override
Allocate and return a default initialized instance of the YAML representation for the MachineFunction...
TargetPassConfig * createPassConfig(PassManagerBase &PM) override
Create a pass configuration object to be used by addPassToEmitX methods for generating a pipeline of ...
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM, CodeGenOptLevel OL, bool JIT)
MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override
Create the target's instance of MachineFunctionInfo.
The core GVN pass object.
Definition: GVN.h:124
Pass to remove unused function declarations.
Definition: GlobalDCE.h:36
This pass is responsible for selecting generic machine instructions to target-specific instructions.
A pass that internalizes all functions and variables other than those that must be preserved accordin...
Definition: Internalize.h:36
Converts loops into loop-closed SSA form.
Definition: LCSSA.h:37
This pass implements the localization mechanism described at the top of this file.
Definition: Localizer.h:43
An optimization pass inserting data prefetches in loops.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void addDelegate(Delegate *delegate)
MachineSchedRegistry provides a selection of available machine instruction schedulers.
This interface provides simple read-only access to a block of memory, and provides simple methods for...
Definition: MemoryBuffer.h:51
virtual StringRef getBufferIdentifier() const
Return an identifier for this buffer, typically the filename it was read from.
Definition: MemoryBuffer.h:76
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
static const OptimizationLevel O0
Disable as many optimizations as possible.
unsigned getSpeedupLevel() const
static const OptimizationLevel O1
Optimize quickly without destroying debuggability.
This class provides access to building LLVM's passes.
Definition: PassBuilder.h:105
void registerPipelineEarlySimplificationEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel, ThinOrFullLTOPhase)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:482
void registerPipelineStartEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:473
void crossRegisterProxies(LoopAnalysisManager &LAM, FunctionAnalysisManager &FAM, CGSCCAnalysisManager &CGAM, ModuleAnalysisManager &MAM, MachineFunctionAnalysisManager *MFAM=nullptr)
Cross register the analysis managers through their proxies.
void registerOptimizerLastEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel, ThinOrFullLTOPhase)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:502
void registerPeepholeEPCallback(const std::function< void(FunctionPassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:407
void registerCGSCCOptimizerLateEPCallback(const std::function< void(CGSCCPassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:452
void registerRegClassFilterParsingCallback(const std::function< RegAllocFilterFunc(StringRef)> &C)
Register callbacks to parse target specific filter field if regalloc pass needs it.
Definition: PassBuilder.h:592
void registerModuleAnalyses(ModuleAnalysisManager &MAM)
Registers all available module analysis passes.
void registerFullLinkTimeOptimizationLastEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:521
void registerFunctionAnalyses(FunctionAnalysisManager &FAM)
Registers all available function analysis passes.
LLVM_ATTRIBUTE_MINSIZE std::enable_if_t<!std::is_same_v< PassT, PassManager > > addPass(PassT &&Pass)
Definition: PassManager.h:195
PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM, ExtraArgTs... ExtraArgs)
Run all of the passes in this manager over the given unit of IR.
PassRegistry - This class manages the registration and intitialization of the pass subsystem as appli...
Definition: PassRegistry.h:37
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
This pass implements the reg bank selector pass used in the GlobalISel pipeline.
Definition: RegBankSelect.h:91
RegisterPassParser class - Handle the addition of new machine passes.
RegisterRegAllocBase class - Track the registration of register allocators.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange)
void setFlag(Register Reg, uint8_t Flag)
bool checkFlag(Register Reg, uint8_t Flag) const
Instances of this class encapsulate one diagnostic report, allowing printing to a raw_ostream as a ca...
Definition: SourceMgr.h:281
Represents a location in source code.
Definition: SMLoc.h:23
Represents a range in source code.
Definition: SMLoc.h:48
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMILive is an implementation of ScheduleDAGInstrs that schedules machine instructions while...
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:575
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:576
Move instructions into successor blocks when possible.
Definition: Sink.h:24
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
unsigned getMainFileID() const
Definition: SourceMgr.h:132
const MemoryBuffer * getMemoryBuffer(unsigned i) const
Definition: SourceMgr.h:125
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
bool consume_front(StringRef Prefix)
Returns true if this StringRef has the given prefix and removes that prefix.
Definition: StringRef.h:635
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
StringSwitch & Cases(StringLiteral S0, StringLiteral S1, T Value)
Definition: StringSwitch.h:90
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Triple TargetTriple
Triple string, CPU name, and target feature strings the TargetMachine instance is created with.
Definition: TargetMachine.h:96
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
StringRef getTargetFeatureString() const
StringRef getTargetCPU() const
std::unique_ptr< const MCSubtargetInfo > STI
void resetTargetOptions(const Function &F) const
Reset the target options based on the function's attributes.
std::unique_ptr< const MCRegisterInfo > MRI
Target-Independent Code Generator Pass Configuration Options.
virtual void addCodeGenPrepare()
Add pass to prepare the LLVM IR for code generation.
virtual bool addILPOpts()
Add passes that optimize instruction level parallelism for out-of-order targets.
virtual void addPostRegAlloc()
This method may be implemented by targets that want to run passes after register allocation pass pipe...
CodeGenOptLevel getOptLevel() const
virtual void addOptimizedRegAlloc()
addOptimizedRegAlloc - Add passes related to register allocation.
virtual void addIRPasses()
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
virtual void addFastRegAlloc()
addFastRegAlloc - Add the minimum set of target-independent passes that are required for fast registe...
virtual void addMachineSSAOptimization()
addMachineSSAOptimization - Add standard passes that optimize machine instructions in SSA form.
void disablePass(AnalysisID PassID)
Allow the target to disable a specific standard pass by default.
AnalysisID addPass(AnalysisID PassID)
Utilities for targets to add passes to the pass manager.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:395
bool isAMDGCN() const
Tests whether the target is AMDGCN.
Definition: Triple.h:880
LLVM Value Representation.
Definition: Value.h:74
bool use_empty() const
Definition: Value.h:344
An efficient, type-erasing, non-owning reference to a callable.
PassManagerBase - An abstract interface to allow code to add passes to a pass manager without having ...
An abstract base class for streams implementations that also support a pwrite operation.
Definition: raw_ostream.h:434
Interfaces for registering analysis passes, producing common pass manager configurations,...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isFlatGlobalAddrSpace(unsigned AS)
bool isEntryFunctionCC(CallingConv::ID CC)
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition: PatternMatch.h:903
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:463
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
FunctionPass * createFlattenCFGPass()
void initializeSIFormMemoryClausesPass(PassRegistry &)
FunctionPass * createFastRegisterAllocator()
FastRegisterAllocation Pass - This pass register allocates as fast as possible.
char & EarlyMachineLICMID
This pass performs loop invariant code motion on machine instructions.
ImmutablePass * createAMDGPUAAWrapperPass()
char & PostRAHazardRecognizerID
PostRAHazardRecognizer - This pass runs the post-ra hazard recognizer.
std::function< bool(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, const Register Reg)> RegAllocFilterFunc
Filter function for register classes during regalloc.
FunctionPass * createAMDGPUSetWavePriorityPass()
Pass * createLCSSAPass()
Definition: LCSSA.cpp:541
void initializeGCNCreateVOPDPass(PassRegistry &)
char & GCNPreRAOptimizationsID
char & GCLoweringID
GCLowering Pass - Used by gc.root to perform its default lowering operations.
void initializeGCNPreRAOptimizationsPass(PassRegistry &)
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
ModulePass * createExpandVariadicsPass(ExpandVariadicsMode)
void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &)
void initializeAMDGPUAttributorLegacyPass(PassRegistry &)
char & SIPostRABundlerID
FunctionPass * createSIAnnotateControlFlowLegacyPass()
Create the annotation pass.
FunctionPass * createSIModeRegisterPass()
void initializeSILowerWWMCopiesLegacyPass(PassRegistry &)
FunctionPass * createGreedyRegisterAllocator()
Greedy register allocation pass - This pass implements a global register allocator for optimized buil...
void initializeAMDGPUAAWrapperPassPass(PassRegistry &)
void initializeSIShrinkInstructionsLegacyPass(PassRegistry &)
ModulePass * createAMDGPULowerBufferFatPointersPass()
void initializeR600ClauseMergePassPass(PassRegistry &)
void initializeSIModeRegisterPass(PassRegistry &)
ModulePass * createAMDGPUCtorDtorLoweringLegacyPass()
ModulePass * createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
void initializeAMDGPURewriteUndefForPHILegacyPass(PassRegistry &)
FunctionPass * createAMDGPUPreLegalizeCombiner(bool IsOptNone)
char & GCNRewritePartialRegUsesID
FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)
void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &)
std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition: Error.cpp:98
void initializeGCNPreRALongBranchRegPass(PassRegistry &)
void initializeSILowerSGPRSpillsLegacyPass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &)
FunctionPass * createNaryReassociatePass()
char & PatchableFunctionID
This pass implements the "patchable-function" attribute.
char & SIOptimizeExecMaskingLegacyID
char & PostRASchedulerID
PostRAScheduler - This pass performs post register allocation scheduling.
void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &)
void initializeR600PacketizerPass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createVOPDPairingMutation()
ModulePass * createAMDGPUAlwaysInlinePass(bool GlobalOpt=true)
void initializeSIPreEmitPeepholePass(PassRegistry &)
void initializeSIFoldOperandsLegacyPass(PassRegistry &)
char & SILoadStoreOptimizerLegacyID
void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &)
std::unique_ptr< CSEConfigBase > getStandardCSEConfigForOpt(CodeGenOptLevel Level)
Definition: CSEInfo.cpp:89
Target & getTheR600Target()
The target for R600 GPUs.
char & MachineSchedulerID
MachineScheduler - This pass schedules machine instructions.
Pass * createStructurizeCFGPass(bool SkipUniformRegions=false)
When SkipUniformRegions is true the structizer will not structurize regions that only contain uniform...
void initializeGCNNSAReassignPass(PassRegistry &)
char & PostMachineSchedulerID
PostMachineScheduler - This pass schedules machine instructions postRA.
void initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(PassRegistry &)
void initializeSIInsertWaitcntsPass(PassRegistry &)
Pass * createLICMPass()
Definition: LICM.cpp:381
ScheduleDAGMILive * createGenericSchedLive(MachineSchedContext *C)
Create the standard converging machine scheduler.
char & SIFormMemoryClausesID
void initializeSILoadStoreOptimizerLegacyPass(PassRegistry &)
void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &)
void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &)
char & EarlyIfConverterLegacyID
EarlyIfConverter - This pass performs if-conversion on SSA form by inserting cmov instructions.
void initializeAMDGPURegBankCombinerPass(PassRegistry &)
void initializeSILateBranchLoweringPass(PassRegistry &)
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition: Pass.h:76
char & AMDGPUUnifyDivergentExitNodesID
FunctionPass * createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy)
FunctionPass * createAMDGPUPreloadKernArgPrologLegacyPass()
char & SIOptimizeVGPRLiveRangeLegacyID
char & ShadowStackGCLoweringID
ShadowStackGCLowering - Implements the custom lowering mechanism used by the shadow stack GC.
char & GCNNSAReassignID
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &)
void initializeAMDGPUExternalAAWrapperPass(PassRegistry &)
auto formatv(bool Validate, const char *Fmt, Ts &&...Vals)
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &)
CodeModel::Model getEffectiveCodeModel(std::optional< CodeModel::Model > CM, CodeModel::Model Default)
Helper method for getting the code model, returning Default if CM does not have a value.
char & SILateBranchLoweringPassID
char & BranchRelaxationPassID
BranchRelaxation - This pass replaces branches that need to jump further than is supported by a branc...
FunctionPass * createSinkingPass()
Definition: Sink.cpp:277
CGSCCToFunctionPassAdaptor createCGSCCToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false, bool NoRerun=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &)
CodeGenFileType
These enums are meant to be passed into addPassesToEmitFile to indicate what type of file to emit,...
Definition: CodeGen.h:83
void initializeSIPostRABundlerPass(PassRegistry &)
void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry &)
char & GCNDPPCombineLegacyID
void initializeSIWholeQuadModePass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createStoreClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, bool ReorderWhileClustering=false)
If ReorderWhileClustering is set to true, no attempt will be made to reduce reordering due to store c...
FunctionPass * createLoopDataPrefetchPass()
FunctionPass * createAMDGPULowerKernelArgumentsPass()
char & AMDGPUInsertDelayAluID
Pass * createAMDGPUAnnotateKernelFeaturesPass()
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::unique_ptr< ScheduleDAGMutation > createAMDGPUMacroFusionDAGMutation()
Note that you have to add: DAG.addMutation(createAMDGPUMacroFusionDAGMutation()); to AMDGPUPassConfig...
char & StackMapLivenessID
StackMapLiveness - This pass analyses the register live-out set of stackmap/patchpoint intrinsics and...
char & SILowerWWMCopiesLegacyID
FunctionPass * createUnifyLoopExitsPass()
char & SIOptimizeExecMaskingPreRAID
FunctionPass * createFixIrreduciblePass()
char & FuncletLayoutID
This pass lays out funclets contiguously.
void initializeSIInsertHardClausesPass(PassRegistry &)
char & DetectDeadLanesID
This pass adds dead/undef flags after analyzing subregister lanes.
void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &)
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
void initializeAMDGPUReserveWWMRegsPass(PassRegistry &)
ModulePass * createAMDGPUPrintfRuntimeBinding()
char & StackSlotColoringID
StackSlotColoring - This pass performs stack slot coloring.
void initializeSIMemoryLegalizerPass(PassRegistry &)
Pass * createAlwaysInlinerLegacyPass(bool InsertLifetime=true)
Create a legacy pass manager instance of a pass to inline and remove functions marked as "always_inli...
void initializeR600ControlFlowFinalizerPass(PassRegistry &)
void initializeAMDGPUImageIntrinsicOptimizerPass(PassRegistry &)
void initializeSILowerControlFlowLegacyPass(PassRegistry &)
char & SIPreAllocateWWMRegsLegacyID
ModulePass * createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &)
char & AMDGPUReserveWWMRegsID
FunctionPass * createAMDGPUPromoteAlloca()
FunctionPass * createSeparateConstOffsetFromGEPPass(bool LowerGEP=false)
char & SIPreEmitPeepholeID
ModulePass * createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *)
void initializeGCNRegPressurePrinterPass(PassRegistry &)
void initializeSILowerI1CopiesLegacyPass(PassRegistry &)
char & SILowerSGPRSpillsLegacyID
void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &)
FunctionPass * createBasicRegisterAllocator()
BasicRegisterAllocation Pass - This pass implements a degenerate global register allocator using the ...
void initializeGlobalISel(PassRegistry &)
Initialize all passes linked into the GlobalISel library.
Definition: GlobalISel.cpp:17
char & SILowerControlFlowLegacyID
ModulePass * createR600OpenCLImageTypeLoweringPass()
FunctionPass * createAMDGPUCodeGenPreparePass()
void initializeSIAnnotateControlFlowLegacyPass(PassRegistry &)
ModulePass * createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass()
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into a AMDGPU-specific.
void initializeSIPreAllocateWWMRegsLegacyPass(PassRegistry &)
void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &)
Target & getTheGCNTarget()
The target for GCN GPUs.
void initializeSIFixSGPRCopiesLegacyPass(PassRegistry &)
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &)
FunctionPass * createGVNPass()
Create a legacy GVN pass.
Definition: GVN.cpp:3374
FunctionPass * createAMDGPURegBankSelectPass()
FunctionPass * createAMDGPURegBankCombiner(bool IsOptNone)
FunctionPass * createAMDGPURegBankLegalizePass()
char & MachineCSELegacyID
MachineCSE - This pass performs global CSE on machine instructions.
Definition: MachineCSE.cpp:164
char & SIWholeQuadModeID
std::unique_ptr< ScheduleDAGMutation > createLoadClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, bool ReorderWhileClustering=false)
If ReorderWhileClustering is set to true, no attempt will be made to reduce reordering due to store c...
void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry &)
void initializeAMDGPUMarkLastScratchLoadPass(PassRegistry &)
char & LiveVariablesID
LiveVariables pass - This pass computes the set of blocks in which each variable is life and sets mac...
void initializeAMDGPUCodeGenPreparePass(PassRegistry &)
FunctionPass * createAMDGPURewriteUndefForPHILegacyPass()
void initializeSIOptimizeExecMaskingLegacyPass(PassRegistry &)
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition: Threading.h:86
FunctionPass * createSILowerI1CopiesLegacyPass()
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &)
char & SIInsertHardClausesID
void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &)
char & SIFixSGPRCopiesLegacyID
void initializeGCNDPPCombineLegacyPass(PassRegistry &)
char & GCNCreateVOPDID
FunctionPass * createInferAddressSpacesPass(unsigned AddressSpace=~0u)
char & SIPeepholeSDWALegacyID
char & VirtRegRewriterID
VirtRegRewriter pass.
Definition: VirtRegMap.cpp:250
char & SIFixVGPRCopiesID
char & SIFoldOperandsLegacyID
FunctionPass * createLowerSwitchPass()
void initializeAMDGPUPreloadKernArgPrologLegacyPass(PassRegistry &)
FunctionPass * createVirtRegRewriter(bool ClearVirtRegs=true)
Definition: VirtRegMap.cpp:734
void initializeR600VectorRegMergerPass(PassRegistry &)
ImmutablePass * createExternalAAWrapperPass(std::function< void(Pass &, Function &, AAResults &)> Callback)
A wrapper pass around a callback which can be used to populate the AAResults in the AAResultsWrapperP...
FunctionPass * createAMDGPUGlobalISelDivergenceLoweringPass()
FunctionPass * createSIMemoryLegalizerPass()
void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &)
void initializeSIOptimizeVGPRLiveRangeLegacyPass(PassRegistry &)
void initializeSIPeepholeSDWALegacyPass(PassRegistry &)
void initializeAMDGPURegBankLegalizePass(PassRegistry &)
char & TwoAddressInstructionPassID
TwoAddressInstruction - This pass reduces two-address instructions to use two operands.
void initializeAMDGPURegBankSelectPass(PassRegistry &)
FunctionPass * createAMDGPULateCodeGenPrepareLegacyPass()
FunctionPass * createAtomicExpandLegacyPass()
AtomicExpandPass - At IR level this pass replace atomic instructions with __atomic_* library calls,...
MCRegisterInfo * createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour)
FunctionPass * createStraightLineStrengthReducePass()
FunctionPass * createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *)
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry &)
void initializeAMDGPULowerBufferFatPointersPass(PassRegistry &)
FunctionPass * createSIInsertWaitcntsPass()
FunctionPass * createAMDGPUAnnotateUniformValuesLegacy()
FunctionPass * createEarlyCSEPass(bool UseMemorySSA=false)
Definition: EarlyCSE.cpp:1944
char & PHIEliminationID
PHIElimination - This pass eliminates machine instruction PHI nodes by inserting copy instructions.
bool parseNamedRegisterReference(PerFunctionMIParsingState &PFS, Register &Reg, StringRef Src, SMDiagnostic &Error)
Definition: MIParser.cpp:3609
FunctionPass * createSIShrinkInstructionsLegacyPass()
char & AMDGPUMarkLastScratchLoadID
char & RenameIndependentSubregsID
This pass detects subregister lanes in a virtual register that are used independently of other lanes ...
void initializeAMDGPUAnnotateUniformValuesLegacyPass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createAMDGPUExportClusteringDAGMutation()
void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry &)
void initializeAMDGPUPromoteAllocaPass(PassRegistry &)
void initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(PassRegistry &)
void initializeAMDGPUInsertDelayAluPass(PassRegistry &)
void initializeAMDGPUUnifyMetadataPass(PassRegistry &)
void initializeAMDGPUAlwaysInlinePass(PassRegistry &)
char & DeadMachineInstructionElimID
DeadMachineInstructionElim - This pass removes dead machine instructions.
char & AMDGPUPerfHintAnalysisLegacyID
char & GCNPreRALongBranchRegID
void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &)
#define N
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ IEEE
IEEE-754 denormal numbers preserved.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
A simple and fast domtree-based CSE pass.
Definition: EarlyCSE.h:30
MachineFunctionInfo - This class can be derived from and used by targets to hold private target-speci...
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
This class manages callbacks registration, as well as provides a way for PassInstrumentation to pass ...
StringMap< VRegInfo * > VRegInfosNamed
Definition: MIParser.h:177
DenseMap< Register, VRegInfo * > VRegInfos
Definition: MIParser.h:176
RegisterTargetMachine - Helper template for registering a target machine implementation,...
A utility pass template to force an analysis result to be available.
Definition: PassManager.h:878
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
The llvm::once_flag structure.
Definition: Threading.h:67
Targets should override this in a way that mirrors the implementation of llvm::MachineFunctionInfo.
SmallVector< StringValue > WWMReservedRegs
std::optional< SIArgumentInfo > ArgInfo
SmallVector< StringValue, 2 > SpillPhysVGPRS
A wrapper around std::string which contains a source range that's being set during parsing.