Bug Summary

File: build/source/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Warning: line 129, column 5
Value stored to 'Ctor' is never read

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUTargetMachine.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/source/llvm/lib/Target/AMDGPU -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1679915782 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-03-27-130437-16335-1 -x c++ /build/source/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// The AMDGPU target machine contains all of the hardware specific
11/// information needed to emit code for SI+ GPUs.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUTargetMachine.h"
16#include "AMDGPU.h"
17#include "AMDGPUAliasAnalysis.h"
18#include "AMDGPUCtorDtorLowering.h"
19#include "AMDGPUExportClustering.h"
20#include "AMDGPUIGroupLP.h"
21#include "AMDGPUMacroFusion.h"
22#include "AMDGPURegBankSelect.h"
23#include "AMDGPUTargetObjectFile.h"
24#include "AMDGPUTargetTransformInfo.h"
25#include "AMDGPUUnifyDivergentExitNodes.h"
26#include "GCNIterativeScheduler.h"
27#include "GCNSchedStrategy.h"
28#include "GCNVOPDUtils.h"
29#include "R600.h"
30#include "R600MachineFunctionInfo.h"
31#include "R600TargetMachine.h"
32#include "SIMachineFunctionInfo.h"
33#include "SIMachineScheduler.h"
34#include "TargetInfo/AMDGPUTargetInfo.h"
35#include "Utils/AMDGPUBaseInfo.h"
36#include "llvm/Analysis/CGSCCPassManager.h"
37#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
38#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
39#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
40#include "llvm/CodeGen/GlobalISel/Legalizer.h"
41#include "llvm/CodeGen/GlobalISel/Localizer.h"
42#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
43#include "llvm/CodeGen/MIRParser/MIParser.h"
44#include "llvm/CodeGen/Passes.h"
45#include "llvm/CodeGen/RegAllocRegistry.h"
46#include "llvm/CodeGen/TargetPassConfig.h"
47#include "llvm/IR/IntrinsicsAMDGPU.h"
48#include "llvm/IR/PassManager.h"
49#include "llvm/IR/PatternMatch.h"
50#include "llvm/InitializePasses.h"
51#include "llvm/MC/TargetRegistry.h"
52#include "llvm/Passes/PassBuilder.h"
53#include "llvm/Transforms/IPO.h"
54#include "llvm/Transforms/IPO/AlwaysInliner.h"
55#include "llvm/Transforms/IPO/GlobalDCE.h"
56#include "llvm/Transforms/IPO/Internalize.h"
57#include "llvm/Transforms/Scalar.h"
58#include "llvm/Transforms/Scalar/GVN.h"
59#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
60#include "llvm/Transforms/Utils.h"
61#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
62#include "llvm/Transforms/Vectorize.h"
63#include <optional>
64
65using namespace llvm;
66using namespace llvm::PatternMatch;
67
68namespace {
69class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
70public:
71 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
72 : RegisterRegAllocBase(N, D, C) {}
73};
74
75class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
76public:
77 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
78 : RegisterRegAllocBase(N, D, C) {}
79};
80
81static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
82 const TargetRegisterClass &RC) {
83 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
84}
85
86static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
87 const TargetRegisterClass &RC) {
88 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
89}
90
91
92/// -{sgpr|vgpr}-regalloc=... command line option.
93static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
94
95/// A dummy default pass factory indicates whether the register allocator is
96/// overridden on the command line.
97static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
98static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
99
100static SGPRRegisterRegAlloc
101defaultSGPRRegAlloc("default",
102 "pick SGPR register allocator based on -O option",
103 useDefaultRegisterAllocator);
104
105static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
106 RegisterPassParser<SGPRRegisterRegAlloc>>
107SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
108 cl::desc("Register allocator to use for SGPRs"));
109
110static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
111 RegisterPassParser<VGPRRegisterRegAlloc>>
112VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
113 cl::desc("Register allocator to use for VGPRs"));
114
115
116static void initializeDefaultSGPRRegisterAllocatorOnce() {
117 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
118
119 if (!Ctor) {
120 Ctor = SGPRRegAlloc;
121 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
122 }
123}
124
125static void initializeDefaultVGPRRegisterAllocatorOnce() {
126 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
127
128 if (!Ctor) {
129 Ctor = VGPRRegAlloc;
Value stored to 'Ctor' is never read
130 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
131 }
132}
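// Editorial note on the report: the store flagged at line 129 is dead --
// 'Ctor' is written but never read again, since the next statement passes
// 'VGPRRegAlloc' to setDefault() directly (line 120 in the SGPR twin has the
// same shape). A minimal fix sketch, not part of the analyzed source, is to
// drop the unused local assignment:
//
// static void initializeDefaultVGPRRegisterAllocatorOnce() {
//   if (!VGPRRegisterRegAlloc::getDefault())
//     VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
// }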
133
134static FunctionPass *createBasicSGPRRegisterAllocator() {
135 return createBasicRegisterAllocator(onlyAllocateSGPRs);
136}
137
138static FunctionPass *createGreedySGPRRegisterAllocator() {
139 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
140}
141
142static FunctionPass *createFastSGPRRegisterAllocator() {
143 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
144}
145
146static FunctionPass *createBasicVGPRRegisterAllocator() {
147 return createBasicRegisterAllocator(onlyAllocateVGPRs);
148}
149
150static FunctionPass *createGreedyVGPRRegisterAllocator() {
151 return createGreedyRegisterAllocator(onlyAllocateVGPRs);
152}
153
154static FunctionPass *createFastVGPRRegisterAllocator() {
155 return createFastRegisterAllocator(onlyAllocateVGPRs, true);
156}
157
158static SGPRRegisterRegAlloc basicRegAllocSGPR(
159 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
160static SGPRRegisterRegAlloc greedyRegAllocSGPR(
161 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
162
163static SGPRRegisterRegAlloc fastRegAllocSGPR(
164 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
165
166
167static VGPRRegisterRegAlloc basicRegAllocVGPR(
168 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
169static VGPRRegisterRegAlloc greedyRegAllocVGPR(
170 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
171
172static VGPRRegisterRegAlloc fastRegAllocVGPR(
173 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
174}
175
176static cl::opt<bool> EnableSROA(
177 "amdgpu-sroa",
178 cl::desc("Run SROA after promote alloca pass"),
179 cl::ReallyHidden,
180 cl::init(true));
181
182static cl::opt<bool>
183EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
184 cl::desc("Run early if-conversion"),
185 cl::init(false));
186
187static cl::opt<bool>
188OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
189 cl::desc("Run pre-RA exec mask optimizations"),
190 cl::init(true));
191
192// Option to disable vectorizer for tests.
193static cl::opt<bool> EnableLoadStoreVectorizer(
194 "amdgpu-load-store-vectorizer",
195 cl::desc("Enable load store vectorizer"),
196 cl::init(true),
197 cl::Hidden);
198
199// Option to control global loads scalarization
200static cl::opt<bool> ScalarizeGlobal(
201 "amdgpu-scalarize-global-loads",
202 cl::desc("Enable global load scalarization"),
203 cl::init(true),
204 cl::Hidden);
205
206// Option to run internalize pass.
207static cl::opt<bool> InternalizeSymbols(
208 "amdgpu-internalize-symbols",
209 cl::desc("Enable elimination of non-kernel functions and unused globals"),
210 cl::init(false),
211 cl::Hidden);
212
213// Option to inline all early.
214static cl::opt<bool> EarlyInlineAll(
215 "amdgpu-early-inline-all",
216 cl::desc("Inline all functions early"),
217 cl::init(false),
218 cl::Hidden);
219
220static cl::opt<bool> RemoveIncompatibleFunctions(
221 "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
222 cl::desc("Enable removal of functions when they "
223 "use features not supported by the target GPU"),
224 cl::init(true));
225
226static cl::opt<bool> EnableSDWAPeephole(
227 "amdgpu-sdwa-peephole",
228 cl::desc("Enable SDWA peepholer"),
229 cl::init(true));
230
231static cl::opt<bool> EnableDPPCombine(
232 "amdgpu-dpp-combine",
233 cl::desc("Enable DPP combiner"),
234 cl::init(true));
235
236// Enable address space based alias analysis
237static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
238 cl::desc("Enable AMDGPU Alias Analysis"),
239 cl::init(true));
240
241// Option to run late CFG structurizer
242static cl::opt<bool, true> LateCFGStructurize(
243 "amdgpu-late-structurize",
244 cl::desc("Enable late CFG structurization"),
245 cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
246 cl::Hidden);
247
248// Enable lib calls simplifications
249static cl::opt<bool> EnableLibCallSimplify(
250 "amdgpu-simplify-libcall",
251 cl::desc("Enable amdgpu library simplifications"),
252 cl::init(true),
253 cl::Hidden);
254
255static cl::opt<bool> EnableLowerKernelArguments(
256 "amdgpu-ir-lower-kernel-arguments",
257 cl::desc("Lower kernel argument loads in IR pass"),
258 cl::init(true),
259 cl::Hidden);
260
261static cl::opt<bool> EnableRegReassign(
262 "amdgpu-reassign-regs",
263 cl::desc("Enable register reassign optimizations on gfx10+"),
264 cl::init(true),
265 cl::Hidden);
266
267static cl::opt<bool> OptVGPRLiveRange(
268 "amdgpu-opt-vgpr-liverange",
269 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
270 cl::init(true), cl::Hidden);
271
272// Enable atomic optimization
273static cl::opt<bool> EnableAtomicOptimizations(
274 "amdgpu-atomic-optimizations",
275 cl::desc("Enable atomic optimizations"),
276 cl::init(false),
277 cl::Hidden);
278
279// Enable Mode register optimization
280static cl::opt<bool> EnableSIModeRegisterPass(
281 "amdgpu-mode-register",
282 cl::desc("Enable mode register pass"),
283 cl::init(true),
284 cl::Hidden);
285
286// Enable GFX11+ s_delay_alu insertion
287static cl::opt<bool>
288 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
289 cl::desc("Enable s_delay_alu insertion"),
290 cl::init(true), cl::Hidden);
291
292// Enable GFX11+ VOPD
293static cl::opt<bool>
294 EnableVOPD("amdgpu-enable-vopd",
295 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
296 cl::init(true), cl::Hidden);
297
298// Option is used in lit tests to prevent deadcoding of patterns inspected.
299static cl::opt<bool>
300EnableDCEInRA("amdgpu-dce-in-ra",
301 cl::init(true), cl::Hidden,
302 cl::desc("Enable machine DCE inside regalloc"));
303
304static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
305 cl::desc("Adjust wave priority"),
306 cl::init(false), cl::Hidden);
307
308static cl::opt<bool> EnableScalarIRPasses(
309 "amdgpu-scalar-ir-passes",
310 cl::desc("Enable scalar IR passes"),
311 cl::init(true),
312 cl::Hidden);
313
314static cl::opt<bool> EnableStructurizerWorkarounds(
315 "amdgpu-enable-structurizer-workarounds",
316 cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
317 cl::Hidden);
318
319static cl::opt<bool> EnableLDSReplaceWithPointer(
320 "amdgpu-enable-lds-replace-with-pointer",
321 cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
322 cl::Hidden);
323
324static cl::opt<bool, true> EnableLowerModuleLDS(
325 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
326 cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
327 cl::Hidden);
328
329static cl::opt<bool> EnablePreRAOptimizations(
330 "amdgpu-enable-pre-ra-optimizations",
331 cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
332 cl::Hidden);
333
334static cl::opt<bool> EnablePromoteKernelArguments(
335 "amdgpu-enable-promote-kernel-arguments",
336 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
337 cl::Hidden, cl::init(true));
338
339static cl::opt<bool> EnableMaxIlpSchedStrategy(
340 "amdgpu-enable-max-ilp-scheduling-strategy",
341 cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
342 cl::Hidden, cl::init(false));
343
344extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
345 // Register the target
346 RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
347 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
348
349 PassRegistry *PR = PassRegistry::getPassRegistry();
350 initializeR600ClauseMergePassPass(*PR);
351 initializeR600ControlFlowFinalizerPass(*PR);
352 initializeR600PacketizerPass(*PR);
353 initializeR600ExpandSpecialInstrsPassPass(*PR);
354 initializeR600VectorRegMergerPass(*PR);
355 initializeGlobalISel(*PR);
356 initializeAMDGPUDAGToDAGISelPass(*PR);
357 initializeGCNDPPCombinePass(*PR);
358 initializeSILowerI1CopiesPass(*PR);
359 initializeSILowerSGPRSpillsPass(*PR);
360 initializeSIFixSGPRCopiesPass(*PR);
361 initializeSIFixVGPRCopiesPass(*PR);
362 initializeSIFoldOperandsPass(*PR);
363 initializeSIPeepholeSDWAPass(*PR);
364 initializeSIShrinkInstructionsPass(*PR);
365 initializeSIOptimizeExecMaskingPreRAPass(*PR);
366 initializeSIOptimizeVGPRLiveRangePass(*PR);
367 initializeSILoadStoreOptimizerPass(*PR);
368 initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
369 initializeAMDGPUAlwaysInlinePass(*PR);
370 initializeAMDGPUAttributorPass(*PR);
371 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
372 initializeAMDGPUAnnotateUniformValuesPass(*PR);
373 initializeAMDGPUArgumentUsageInfoPass(*PR);
374 initializeAMDGPUAtomicOptimizerPass(*PR);
375 initializeAMDGPULowerKernelArgumentsPass(*PR);
376 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
377 initializeAMDGPULowerKernelAttributesPass(*PR);
378 initializeAMDGPULowerIntrinsicsPass(*PR);
379 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
380 initializeAMDGPUPostLegalizerCombinerPass(*PR);
381 initializeAMDGPUPreLegalizerCombinerPass(*PR);
382 initializeAMDGPURegBankCombinerPass(*PR);
383 initializeAMDGPURegBankSelectPass(*PR);
384 initializeAMDGPUPromoteAllocaPass(*PR);
385 initializeAMDGPUPromoteAllocaToVectorPass(*PR);
386 initializeAMDGPUCodeGenPreparePass(*PR);
387 initializeAMDGPULateCodeGenPreparePass(*PR);
388 initializeAMDGPUPropagateAttributesEarlyPass(*PR);
389 initializeAMDGPUPropagateAttributesLatePass(*PR);
390 initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
391 initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
392 initializeAMDGPULowerModuleLDSPass(*PR);
393 initializeAMDGPURewriteOutArgumentsPass(*PR);
394 initializeAMDGPURewriteUndefForPHIPass(*PR);
395 initializeAMDGPUUnifyMetadataPass(*PR);
396 initializeSIAnnotateControlFlowPass(*PR);
397 initializeAMDGPUReleaseVGPRsPass(*PR);
398 initializeAMDGPUInsertDelayAluPass(*PR);
399 initializeSIInsertHardClausesPass(*PR);
400 initializeSIInsertWaitcntsPass(*PR);
401 initializeSIModeRegisterPass(*PR);
402 initializeSIWholeQuadModePass(*PR);
403 initializeSILowerControlFlowPass(*PR);
404 initializeSIPreEmitPeepholePass(*PR);
405 initializeSILateBranchLoweringPass(*PR);
406 initializeSIMemoryLegalizerPass(*PR);
407 initializeSIOptimizeExecMaskingPass(*PR);
408 initializeSIPreAllocateWWMRegsPass(*PR);
409 initializeSIFormMemoryClausesPass(*PR);
410 initializeSIPostRABundlerPass(*PR);
411 initializeGCNCreateVOPDPass(*PR);
412 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
413 initializeAMDGPUAAWrapperPassPass(*PR);
414 initializeAMDGPUExternalAAWrapperPass(*PR);
415 initializeAMDGPUUseNativeCallsPass(*PR);
416 initializeAMDGPUSimplifyLibCallsPass(*PR);
417 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
418 initializeAMDGPUResourceUsageAnalysisPass(*PR);
419 initializeGCNNSAReassignPass(*PR);
420 initializeGCNPreRAOptimizationsPass(*PR);
421}
422
423static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
424 return std::make_unique<AMDGPUTargetObjectFile>();
425}
426
427static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
428 return new SIScheduleDAGMI(C);
429}
430
431static ScheduleDAGInstrs *
432createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
433 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
434 ScheduleDAGMILive *DAG =
435 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
436 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
437 if (ST.shouldClusterStores())
438 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
439 DAG->addMutation(createIGroupLPDAGMutation());
440 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
441 DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
442 return DAG;
443}
444
445static ScheduleDAGInstrs *
446createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
447 ScheduleDAGMILive *DAG =
448 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
449 DAG->addMutation(createIGroupLPDAGMutation());
450 return DAG;
451}
452
453static ScheduleDAGInstrs *
454createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
455 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
456 auto DAG = new GCNIterativeScheduler(C,
457 GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
458 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
459 if (ST.shouldClusterStores())
460 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
461 return DAG;
462}
463
464static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
465 return new GCNIterativeScheduler(C,
466 GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
467}
468
469static ScheduleDAGInstrs *
470createIterativeILPMachineScheduler(MachineSchedContext *C) {
471 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
472 auto DAG = new GCNIterativeScheduler(C,
473 GCNIterativeScheduler::SCHEDULE_ILP);
474 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
475 if (ST.shouldClusterStores())
476 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
477 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
478 return DAG;
479}
480
481static MachineSchedRegistry
482SISchedRegistry("si", "Run SI's custom scheduler",
483 createSIMachineScheduler);
484
485static MachineSchedRegistry
486GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
487 "Run GCN scheduler to maximize occupancy",
488 createGCNMaxOccupancyMachineScheduler);
489
490static MachineSchedRegistry
491 GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
492 createGCNMaxILPMachineScheduler);
493
494static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
495 "gcn-iterative-max-occupancy-experimental",
496 "Run GCN scheduler to maximize occupancy (experimental)",
497 createIterativeGCNMaxOccupancyMachineScheduler);
498
499static MachineSchedRegistry GCNMinRegSchedRegistry(
500 "gcn-iterative-minreg",
501 "Run GCN iterative scheduler for minimal register usage (experimental)",
502 createMinRegScheduler);
503
504static MachineSchedRegistry GCNILPSchedRegistry(
505 "gcn-iterative-ilp",
506 "Run GCN iterative scheduler for ILP scheduling (experimental)",
507 createIterativeILPMachineScheduler);
508
509static StringRef computeDataLayout(const Triple &TT) {
510 if (TT.getArch() == Triple::r600) {
511 // 32-bit pointers.
512 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
513 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
514 }
515
516 // 32-bit private, local, and region pointers. 64-bit global, constant and
517 // flat, non-integral buffer fat pointers.
518 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
519 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
520 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
521 "-ni:7";
522}
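// Editorial gloss on the layout string above, assuming standard LLVM
// DataLayout syntax: "p5:32:32" gives 32-bit private pointers, "A5" places
// allocas in address space 5, "G1" makes address space 1 the default for
// globals, and "ni:7" marks address space 7 (buffer fat pointers) as
// non-integral.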
523
524LLVM_READNONE
525static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
526 if (!GPU.empty())
527 return GPU;
528
529 // Need to default to a target with flat support for HSA.
530 if (TT.getArch() == Triple::amdgcn)
531 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
532
533 return "r600";
534}
535
536static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
537 // The AMDGPU toolchain only supports generating shared objects, so we
538 // must always use PIC.
539 return Reloc::PIC_;
540}
541
542AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
543 StringRef CPU, StringRef FS,
544 TargetOptions Options,
545 std::optional<Reloc::Model> RM,
546 std::optional<CodeModel::Model> CM,
547 CodeGenOpt::Level OptLevel)
548 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
549 FS, Options, getEffectiveRelocModel(RM),
550 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
551 TLOF(createTLOF(getTargetTriple())) {
552 initAsmInfo();
553 if (TT.getArch() == Triple::amdgcn) {
554 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
555 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
556 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
557 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
558 }
559}
560
561bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
562bool AMDGPUTargetMachine::EnableFunctionCalls = false;
563bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
564
565AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
566
567StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
568 Attribute GPUAttr = F.getFnAttribute("target-cpu");
569 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
570}
571
572StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
573 Attribute FSAttr = F.getFnAttribute("target-features");
574
575 return FSAttr.isValid() ? FSAttr.getValueAsString()
576 : getTargetFeatureString();
577}
578
579/// Predicate for Internalize pass.
580static bool mustPreserveGV(const GlobalValue &GV) {
581 if (const Function *F = dyn_cast<Function>(&GV))
582 return F->isDeclaration() || F->getName().startswith("__asan_") ||
583 F->getName().startswith("__sanitizer_") ||
584 AMDGPU::isEntryFunctionCC(F->getCallingConv());
585
586 GV.removeDeadConstantUsers();
587 return !GV.use_empty();
588}
589
590void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
591 AAM.registerFunctionAnalysis<AMDGPUAA>();
592}
593
594void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
595 PB.registerPipelineParsingCallback(
596 [this](StringRef PassName, ModulePassManager &PM,
597 ArrayRef<PassBuilder::PipelineElement>) {
598 if (PassName == "amdgpu-propagate-attributes-late") {
599 PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
600 return true;
601 }
602 if (PassName == "amdgpu-unify-metadata") {
603 PM.addPass(AMDGPUUnifyMetadataPass());
604 return true;
605 }
606 if (PassName == "amdgpu-printf-runtime-binding") {
607 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
608 return true;
609 }
610 if (PassName == "amdgpu-always-inline") {
611 PM.addPass(AMDGPUAlwaysInlinePass());
612 return true;
613 }
614 if (PassName == "amdgpu-replace-lds-use-with-pointer") {
615 PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
616 return true;
617 }
618 if (PassName == "amdgpu-lower-module-lds") {
619 PM.addPass(AMDGPULowerModuleLDSPass());
620 return true;
621 }
622 if (PassName == "amdgpu-lower-ctor-dtor") {
623 PM.addPass(AMDGPUCtorDtorLoweringPass());
624 return true;
625 }
626 return false;
627 });
628 PB.registerPipelineParsingCallback(
629 [this](StringRef PassName, FunctionPassManager &PM,
630 ArrayRef<PassBuilder::PipelineElement>) {
631 if (PassName == "amdgpu-simplifylib") {
632 PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
633 return true;
634 }
635 if (PassName == "amdgpu-usenative") {
636 PM.addPass(AMDGPUUseNativeCallsPass());
637 return true;
638 }
639 if (PassName == "amdgpu-promote-alloca") {
640 PM.addPass(AMDGPUPromoteAllocaPass(*this));
641 return true;
642 }
643 if (PassName == "amdgpu-promote-alloca-to-vector") {
644 PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
645 return true;
646 }
647 if (PassName == "amdgpu-lower-kernel-attributes") {
648 PM.addPass(AMDGPULowerKernelAttributesPass());
649 return true;
650 }
651 if (PassName == "amdgpu-propagate-attributes-early") {
652 PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
653 return true;
654 }
655 if (PassName == "amdgpu-promote-kernel-arguments") {
656 PM.addPass(AMDGPUPromoteKernelArgumentsPass());
657 return true;
658 }
659 if (PassName == "amdgpu-unify-divergent-exit-nodes") {
660 PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
661 return true;
662 }
663 return false;
664 });
665
666 PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
667 FAM.registerPass([&] { return AMDGPUAA(); });
668 });
669
670 PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
671 if (AAName == "amdgpu-aa") {
672 AAM.registerFunctionAnalysis<AMDGPUAA>();
673 return true;
674 }
675 return false;
676 });
677
678 PB.registerPipelineStartEPCallback(
679 [this](ModulePassManager &PM, OptimizationLevel Level) {
680 FunctionPassManager FPM;
681 FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
682 FPM.addPass(AMDGPUUseNativeCallsPass());
683 if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
684 FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
685 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
686 });
687
688 PB.registerPipelineEarlySimplificationEPCallback(
689 [this](ModulePassManager &PM, OptimizationLevel Level) {
690 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
691
692 if (Level == OptimizationLevel::O0)
693 return;
694
695 PM.addPass(AMDGPUUnifyMetadataPass());
696
697 if (InternalizeSymbols) {
698 PM.addPass(InternalizePass(mustPreserveGV));
699 }
700 PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
701 if (InternalizeSymbols) {
702 PM.addPass(GlobalDCEPass());
703 }
704 if (EarlyInlineAll && !EnableFunctionCalls)
705 PM.addPass(AMDGPUAlwaysInlinePass());
706 });
707
708 PB.registerCGSCCOptimizerLateEPCallback(
709 [this](CGSCCPassManager &PM, OptimizationLevel Level) {
710 if (Level == OptimizationLevel::O0)
711 return;
712
713 FunctionPassManager FPM;
714
715 // Add promote kernel arguments pass to the opt pipeline right before
716 // infer address spaces which is needed to do actual address space
717 // rewriting.
718 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
719 EnablePromoteKernelArguments)
720 FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
721
722 // Add infer address spaces pass to the opt pipeline after inlining
723 // but before SROA to increase SROA opportunities.
724 FPM.addPass(InferAddressSpacesPass());
725
726 // This should run after inlining to have any chance of doing
727 // anything, and before other cleanup optimizations.
728 FPM.addPass(AMDGPULowerKernelAttributesPass());
729
730 if (Level != OptimizationLevel::O0) {
731 // Promote alloca to vector before SROA and loop unroll. If we
732 // manage to eliminate allocas before unroll we may choose to unroll
733 // less.
734 FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
735 }
736
737 PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
738 });
739}
740
741int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
742 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
743 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
744 AddrSpace == AMDGPUAS::REGION_ADDRESS)
745 ? -1
746 : 0;
747}
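// Editorial note: in the local, private, and region address spaces address
// zero is a valid object address on AMDGPU, so the all-ones pattern (-1)
// encodes the null pointer; every other address space keeps 0 as null.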
748
749bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
750 unsigned DestAS) const {
751 return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
752 AMDGPU::isFlatGlobalAddrSpace(DestAS);
753}
754
755unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
756 const auto *LD = dyn_cast<LoadInst>(V);
757 if (!LD)
758 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
759
760 // It must be a generic pointer loaded.
761 assert(V->getType()->isPointerTy() &&
762 V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
763
764 const auto *Ptr = LD->getPointerOperand();
765 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
766 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
767 // For a generic pointer loaded from the constant memory, it could be assumed
768 // as a global pointer since the constant memory is only populated on the
769 // host side. As implied by the offload programming model, only global
770 // pointers could be referenced on the host side.
771 return AMDGPUAS::GLOBAL_ADDRESS;
772}
773
774std::pair<const Value *, unsigned>
775AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
776 if (auto *II = dyn_cast<IntrinsicInst>(V)) {
777 switch (II->getIntrinsicID()) {
778 case Intrinsic::amdgcn_is_shared:
779 return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
780 case Intrinsic::amdgcn_is_private:
781 return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
782 default:
783 break;
784 }
785 return std::pair(nullptr, -1);
786 }
787 // Check the global pointer predication based on
788 // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
789 // the order of 'is_shared' and 'is_private' is not significant.
790 Value *Ptr;
791 if (match(
792 const_cast<Value *>(V),
793 m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
794 m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
795 m_Deferred(Ptr))))))
796 return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
797
798 return std::pair(nullptr, -1);
799}
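// For illustration, the IR shape the m_c_And pattern above recognizes
// (editorial sketch, value names hypothetical):
//
//   %is.shared  = call i1 @llvm.amdgcn.is.shared(ptr %p)
//   %not.shared = xor i1 %is.shared, true
//   %is.private = call i1 @llvm.amdgcn.is.private(ptr %p)
//   %not.private = xor i1 %is.private, true
//   %cond = and i1 %not.shared, %not.private
//
// When %cond holds, %p points neither to LDS nor to a private allocation,
// so it is assumed to be a global pointer.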
800
801unsigned
802AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
803 switch (Kind) {
804 case PseudoSourceValue::Stack:
805 case PseudoSourceValue::FixedStack:
806 return AMDGPUAS::PRIVATE_ADDRESS;
807 case PseudoSourceValue::ConstantPool:
808 case PseudoSourceValue::GOT:
809 case PseudoSourceValue::JumpTable:
810 case PseudoSourceValue::GlobalValueCallEntry:
811 case PseudoSourceValue::ExternalSymbolCallEntry:
812 return AMDGPUAS::CONSTANT_ADDRESS;
813 }
814 return AMDGPUAS::FLAT_ADDRESS;
815}
816
817//===----------------------------------------------------------------------===//
818// GCN Target Machine (SI+)
819//===----------------------------------------------------------------------===//
820
821GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
822 StringRef CPU, StringRef FS,
823 TargetOptions Options,
824 std::optional<Reloc::Model> RM,
825 std::optional<CodeModel::Model> CM,
826 CodeGenOpt::Level OL, bool JIT)
827 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
828
829const TargetSubtargetInfo *
830GCNTargetMachine::getSubtargetImpl(const Function &F) const {
831 StringRef GPU = getGPUName(F);
832 StringRef FS = getFeatureString(F);
833
834 SmallString<128> SubtargetKey(GPU);
835 SubtargetKey.append(FS);
836
837 auto &I = SubtargetMap[SubtargetKey];
838 if (!I) {
839 // This needs to be done before we create a new subtarget since any
840 // creation will depend on the TM and the code generation flags on the
841 // function that reside in TargetOptions.
842 resetTargetOptions(F);
843 I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
844 }
845
846 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
847
848 return I.get();
849}
850
851TargetTransformInfo
852GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
853 return TargetTransformInfo(GCNTTIImpl(this, F));
854}
855
856//===----------------------------------------------------------------------===//
857// AMDGPU Pass Setup
858//===----------------------------------------------------------------------===//
859
860std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
861 return getStandardCSEConfigForOpt(TM->getOptLevel());
862}
863
864namespace {
865
866class GCNPassConfig final : public AMDGPUPassConfig {
867public:
868 GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
869 : AMDGPUPassConfig(TM, PM) {
870 // It is necessary to know the register usage of the entire call graph. We
871 // allow calls without EnableAMDGPUFunctionCalls if they are marked
872 // noinline, so this is always required.
873 setRequiresCodeGenSCCOrder(true);
874 substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
875 }
876
877 GCNTargetMachine &getGCNTargetMachine() const {
878 return getTM<GCNTargetMachine>();
879 }
880
881 ScheduleDAGInstrs *
882 createMachineScheduler(MachineSchedContext *C) const override;
883
884 ScheduleDAGInstrs *
885 createPostMachineScheduler(MachineSchedContext *C) const override {
886 ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
887 C, std::make_unique<PostGenericScheduler>(C),
888 /*RemoveKillFlags=*/true);
889 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
890 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
891 if (ST.shouldClusterStores())
892 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
893 DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
894 DAG->addMutation(createIGroupLPDAGMutation());
895 if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
896 DAG->addMutation(createVOPDPairingMutation());
897 return DAG;
898 }
899
900 bool addPreISel() override;
901 void addMachineSSAOptimization() override;
902 bool addILPOpts() override;
903 bool addInstSelector() override;
904 bool addIRTranslator() override;
905 void addPreLegalizeMachineIR() override;
906 bool addLegalizeMachineIR() override;
907 void addPreRegBankSelect() override;
908 bool addRegBankSelect() override;
909 void addPreGlobalInstructionSelect() override;
910 bool addGlobalInstructionSelect() override;
911 void addFastRegAlloc() override;
912 void addOptimizedRegAlloc() override;
913
914 FunctionPass *createSGPRAllocPass(bool Optimized);
915 FunctionPass *createVGPRAllocPass(bool Optimized);
916 FunctionPass *createRegAllocPass(bool Optimized) override;
917
918 bool addRegAssignAndRewriteFast() override;
919 bool addRegAssignAndRewriteOptimized() override;
920
921 void addPreRegAlloc() override;
922 bool addPreRewrite() override;
923 void addPostRegAlloc() override;
924 void addPreSched2() override;
925 void addPreEmitPass() override;
926};
927
928} // end anonymous namespace
929
930AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
931 : TargetPassConfig(TM, PM) {
932 // Exceptions and StackMaps are not supported, so these passes will never do
933 // anything.
934 disablePass(&StackMapLivenessID);
935 disablePass(&FuncletLayoutID);
936 // Garbage collection is not supported.
937 disablePass(&GCLoweringID);
938 disablePass(&ShadowStackGCLoweringID);
939}
940
941void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
942 if (getOptLevel() == CodeGenOpt::Aggressive)
943 addPass(createGVNPass());
944 else
945 addPass(createEarlyCSEPass());
946}
947
948void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
949 addPass(createLICMPass());
950 addPass(createSeparateConstOffsetFromGEPPass());
951 // ReassociateGEPs exposes more opportunities for SLSR. See
952 // the example in reassociate-geps-and-slsr.ll.
953 addPass(createStraightLineStrengthReducePass());
954 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
955 // EarlyCSE can reuse.
956 addEarlyCSEOrGVNPass();
957 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
958 addPass(createNaryReassociatePass());
959 // NaryReassociate on GEPs creates redundant common expressions, so run
960 // EarlyCSE after it.
961 addPass(createEarlyCSEPass());
962}
963
964void AMDGPUPassConfig::addIRPasses() {
965 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
966
967 // There is no reason to run these.
968 disablePass(&StackMapLivenessID);
969 disablePass(&FuncletLayoutID);
970 disablePass(&PatchableFunctionID);
971
972 addPass(createAMDGPUPrintfRuntimeBinding());
973 addPass(createAMDGPUCtorDtorLoweringLegacyPass());
974
975 // Run the propagate attributes pass in the backend in case opt was not run.
976 addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
977
978 addPass(createAMDGPULowerIntrinsicsPass());
979
980 // Function calls are not supported, so make sure we inline everything.
981 addPass(createAMDGPUAlwaysInlinePass());
982 addPass(createAlwaysInlinerLegacyPass());
983
984 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
985 if (TM.getTargetTriple().getArch() == Triple::r600)
986 addPass(createR600OpenCLImageTypeLoweringPass());
987
988 // Replace OpenCL enqueued block function pointers with global variables.
989 addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
990
991 // Can increase LDS used by kernel so runs before PromoteAlloca
992 if (EnableLowerModuleLDS) {
993 // The pass "amdgpu-replace-lds-use-with-pointer" needs to be run before the
994 // pass "amdgpu-lower-module-lds", and it is required to be run only if the
995 // "amdgpu-lower-module-lds" pass is enabled.
996 if (EnableLDSReplaceWithPointer)
997 addPass(createAMDGPUReplaceLDSUseWithPointerPass());
998
999 addPass(createAMDGPULowerModuleLDSPass());
1000 }
1001
1002 if (TM.getOptLevel() > CodeGenOpt::None)
1003 addPass(createInferAddressSpacesPass());
1004
1005 addPass(createAtomicExpandPass());
1006
1007 if (TM.getOptLevel() > CodeGenOpt::None) {
1008 addPass(createAMDGPUPromoteAlloca());
1009
1010 if (EnableSROA)
1011 addPass(createSROAPass());
1012 if (isPassEnabled(EnableScalarIRPasses))
1013 addStraightLineScalarOptimizationPasses();
1014
1015 if (EnableAMDGPUAliasAnalysis) {
1016 addPass(createAMDGPUAAWrapperPass());
1017 addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1018 AAResults &AAR) {
1019 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1020 AAR.addAAResult(WrapperPass->getResult());
1021 }));
1022 }
1023
1024 if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1025 // TODO: May want to move later or split into an early and late one.
1026 addPass(createAMDGPUCodeGenPreparePass());
1027 }
1028 }
1029
1030 TargetPassConfig::addIRPasses();
1031
1032 // EarlyCSE is not always strong enough to clean up what LSR produces. For
1033 // example, GVN can combine
1034 //
1035 // %0 = add %a, %b
1036 // %1 = add %b, %a
1037 //
1038 // and
1039 //
1040 // %0 = shl nsw %a, 2
1041 // %1 = shl %a, 2
1042 //
1043 // but EarlyCSE can do neither of them.
1044 if (isPassEnabled(EnableScalarIRPasses))
1045 addEarlyCSEOrGVNPass();
1046}
1047
1048void AMDGPUPassConfig::addCodeGenPrepare() {
1049 if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1050 if (RemoveIncompatibleFunctions)
1051 addPass(createAMDGPURemoveIncompatibleFunctionsPass(TM));
1052
1053 addPass(createAMDGPUAttributorPass());
1054
1055 // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1056 // analysis, and should be removed.
1057 addPass(createAMDGPUAnnotateKernelFeaturesPass());
1058 }
1059
1060 if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1061 EnableLowerKernelArguments)
1062 addPass(createAMDGPULowerKernelArgumentsPass());
1063
1064 TargetPassConfig::addCodeGenPrepare();
1065
1066 if (isPassEnabled(EnableLoadStoreVectorizer))
1067 addPass(createLoadStoreVectorizerPass());
1068
1069 // LowerSwitch pass may introduce unreachable blocks that can
1070 // cause unexpected behavior for subsequent passes. Placing it
1071 // here seems better, as these blocks would get cleaned up by
1072 // UnreachableBlockElim inserted next in the pass flow.
1073 addPass(createLowerSwitchPass());
1074}
1075
1076bool AMDGPUPassConfig::addPreISel() {
1077 if (TM->getOptLevel() > CodeGenOpt::None)
1078 addPass(createFlattenCFGPass());
1079 return false;
1080}
1081
1082bool AMDGPUPassConfig::addInstSelector() {
1083 addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
1084 return false;
1085}
1086
1087bool AMDGPUPassConfig::addGCPasses() {
1088 // Do nothing. GC is not supported.
1089 return false;
1090}
1091
1092llvm::ScheduleDAGInstrs *
1093AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1094 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1095 ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1096 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1097 if (ST.shouldClusterStores())
1098 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1099 return DAG;
1100}
1101
1102MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
1103 BumpPtrAllocator &Allocator, const Function &F,
1104 const TargetSubtargetInfo *STI) const {
1105 return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
1106 Allocator, F, static_cast<const R600Subtarget *>(STI));
1107}
1108
1109//===----------------------------------------------------------------------===//
1110// GCN Pass Setup
1111//===----------------------------------------------------------------------===//
1112
1113ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1114 MachineSchedContext *C) const {
1115 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1116 if (ST.enableSIScheduler())
1117 return createSIMachineScheduler(C);
1118
1119 if (EnableMaxIlpSchedStrategy)
1120 return createGCNMaxILPMachineScheduler(C);
1121
1122 return createGCNMaxOccupancyMachineScheduler(C);
1123}
1124
1125bool GCNPassConfig::addPreISel() {
1126 AMDGPUPassConfig::addPreISel();
1127
1128 if (TM->getOptLevel() > CodeGenOpt::None)
1129 addPass(createAMDGPULateCodeGenPreparePass());
1130
1131 if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
1132 addPass(createAMDGPUAtomicOptimizerPass());
1133 }
1134
1135 if (TM->getOptLevel() > CodeGenOpt::None)
1136 addPass(createSinkingPass());
1137
1138 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1139 // regions formed by them.
1140 addPass(&AMDGPUUnifyDivergentExitNodesID);
1141 if (!LateCFGStructurize) {
1142 if (EnableStructurizerWorkarounds) {
1143 addPass(createFixIrreduciblePass());
1144 addPass(createUnifyLoopExitsPass());
1145 }
1146 addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1147 }
1148 addPass(createAMDGPUAnnotateUniformValues());
1149 if (!LateCFGStructurize) {
1150 addPass(createSIAnnotateControlFlowPass());
1151 // TODO: Move this right after structurizeCFG to avoid extra divergence
1152 // analysis. This depends on stopping SIAnnotateControlFlow from making
1153 // control flow modifications.
1154 addPass(createAMDGPURewriteUndefForPHIPass());
1155 }
1156 addPass(createLCSSAPass());
1157
1158 if (TM->getOptLevel() > CodeGenOpt::Less)
1159 addPass(&AMDGPUPerfHintAnalysisID);
1160
1161 return false;
1162}
1163
1164void GCNPassConfig::addMachineSSAOptimization() {
1165 TargetPassConfig::addMachineSSAOptimization();
1166
1167 // We want to fold operands after PeepholeOptimizer has run (or as part of
1168 // it), because it will eliminate extra copies making it easier to fold the
1169 // real source operand. We want to eliminate dead instructions after, so that
1170 // we see fewer uses of the copies. We then need to clean up the dead
1171 // instructions leftover after the operands are folded as well.
1172 //
1173 // XXX - Can we get away without running DeadMachineInstructionElim again?
1174 addPass(&SIFoldOperandsID);
1175 if (EnableDPPCombine)
1176 addPass(&GCNDPPCombineID);
1177 addPass(&SILoadStoreOptimizerID);
1178 if (isPassEnabled(EnableSDWAPeephole)) {
1179 addPass(&SIPeepholeSDWAID);
1180 addPass(&EarlyMachineLICMID);
1181 addPass(&MachineCSEID);
1182 addPass(&SIFoldOperandsID);
1183 }
1184 addPass(&DeadMachineInstructionElimID);
1185 addPass(createSIShrinkInstructionsPass());
1186}
1187
1188bool GCNPassConfig::addILPOpts() {
1189 if (EnableEarlyIfConversion)
1190 addPass(&EarlyIfConverterID);
1191
1192 TargetPassConfig::addILPOpts();
1193 return false;
1194}
1195
1196bool GCNPassConfig::addInstSelector() {
1197 AMDGPUPassConfig::addInstSelector();
1198 addPass(&SIFixSGPRCopiesID);
1199 addPass(createSILowerI1CopiesPass());
1200 return false;
1201}
1202
1203bool GCNPassConfig::addIRTranslator() {
1204 addPass(new IRTranslator(getOptLevel()));
1205 return false;
1206}
1207
1208void GCNPassConfig::addPreLegalizeMachineIR() {
1209 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1210 addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1211 addPass(new Localizer());
1212}
1213
1214bool GCNPassConfig::addLegalizeMachineIR() {
1215 addPass(new Legalizer());
1216 return false;
1217}
1218
1219void GCNPassConfig::addPreRegBankSelect() {
1220 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1221 addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1222}
1223
1224bool GCNPassConfig::addRegBankSelect() {
1225 addPass(new AMDGPURegBankSelect());
1226 return false;
1227}
1228
1229void GCNPassConfig::addPreGlobalInstructionSelect() {
1230 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1231 addPass(createAMDGPURegBankCombiner(IsOptNone));
1232}
1233
1234bool GCNPassConfig::addGlobalInstructionSelect() {
1235 addPass(new InstructionSelect(getOptLevel()));
1236 return false;
1237}
1238
1239void GCNPassConfig::addPreRegAlloc() {
1240 if (LateCFGStructurize) {
1241 addPass(createAMDGPUMachineCFGStructurizerPass());
1242 }
1243}
1244
1245void GCNPassConfig::addFastRegAlloc() {
1246 // FIXME: We have to disable the verifier here because of PHIElimination +
1247 // TwoAddressInstructions disabling it.
1248
1249 // This must be run immediately after phi elimination and before
1250 // TwoAddressInstructions, otherwise the processing of the tied operand of
1251 // SI_ELSE will introduce a copy of the tied operand source after the else.
1252 insertPass(&PHIEliminationID, &SILowerControlFlowID);
1253
1254 insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1255 insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
1256
1257 TargetPassConfig::addFastRegAlloc();
1258}
1259
1260void GCNPassConfig::addOptimizedRegAlloc() {
1261 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1262 // instructions that cause scheduling barriers.
1263 insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1264 insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
1265
1266 if (OptExecMaskPreRA)
1267 insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1268
1269 if (isPassEnabled(EnablePreRAOptimizations))
1270 insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1271
1272 // This is not an essential optimization and it has a noticeable impact on
1273 // compilation time, so we only enable it from O2.
1274 if (TM->getOptLevel() > CodeGenOpt::Less)
1275 insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1276
1277 // FIXME: when an instruction has a Killed operand, and the instruction is
1278 // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
1279 // the register in LiveVariables, this would trigger a failure in verifier,
1280 // we should fix it and enable the verifier.
1281 if (OptVGPRLiveRange)
1282 insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
1283 // This must be run immediately after phi elimination and before
1284 // TwoAddressInstructions, otherwise the processing of the tied operand of
1285 // SI_ELSE will introduce a copy of the tied operand source after the else.
1286 insertPass(&PHIEliminationID, &SILowerControlFlowID);
1287
1288 if (EnableDCEInRA)
1289 insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1290
1291 TargetPassConfig::addOptimizedRegAlloc();
1292}
1293
1294bool GCNPassConfig::addPreRewrite() {
1295 if (EnableRegReassign)
1296 addPass(&GCNNSAReassignID);
1297 return true;
1298}
1299
1300FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1301 // Initialize the global default.
1302 llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1303 initializeDefaultSGPRRegisterAllocatorOnce);
1304
1305 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1306 if (Ctor != useDefaultRegisterAllocator)
1307 return Ctor();
1308
1309 if (Optimized)
1310 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1311
1312 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1313}
1314
1315FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1316 // Initialize the global default.
1317 llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1318 initializeDefaultVGPRRegisterAllocatorOnce);
1319
1320 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1321 if (Ctor != useDefaultRegisterAllocator)
1322 return Ctor();
1323
1324 if (Optimized)
1325 return createGreedyVGPRRegisterAllocator();
1326
1327 return createFastVGPRRegisterAllocator();
1328}
1329
1330FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1331 llvm_unreachable("should not be used");
1332}
1333
1334static const char RegAllocOptNotSupportedMessage[] =
1335 "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1336
1337bool GCNPassConfig::addRegAssignAndRewriteFast() {
1338 if (!usingDefaultRegAlloc())
1339 report_fatal_error(RegAllocOptNotSupportedMessage);
1340
1341 addPass(createSGPRAllocPass(false));
1342
1343 // Equivalent of PEI for SGPRs.
1344 addPass(&SILowerSGPRSpillsID);
1345
1346 addPass(createVGPRAllocPass(false));
1347 return true;
1348}
1349
1350bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1351 if (!usingDefaultRegAlloc())
1352 report_fatal_error(RegAllocOptNotSupportedMessage);
1353
1354 addPass(createSGPRAllocPass(true));
1355
1356 // Commit allocated register changes. This is mostly necessary because too
1357 // many things rely on the use lists of the physical registers, such as the
1358 // verifier. This is only necessary with allocators which use LiveIntervals,
1359 // since FastRegAlloc does the replacements itself.
1360 addPass(createVirtRegRewriter(false));
1361
1362 // Equivalent of PEI for SGPRs.
1363 addPass(&SILowerSGPRSpillsID);
1364
1365 addPass(createVGPRAllocPass(true));
1366
1367 addPreRewrite();
1368 addPass(&VirtRegRewriterID);
1369
1370 return true;
1371}
1372
1373void GCNPassConfig::addPostRegAlloc() {
1374 addPass(&SIFixVGPRCopiesID);
1375 if (getOptLevel() > CodeGenOpt::None)
1376 addPass(&SIOptimizeExecMaskingID);
1377 TargetPassConfig::addPostRegAlloc();
1378}
1379
1380void GCNPassConfig::addPreSched2() {
1381 if (TM->getOptLevel() > CodeGenOpt::None)
1382 addPass(createSIShrinkInstructionsPass());
1383 addPass(&SIPostRABundlerID);
1384}
1385
1386void GCNPassConfig::addPreEmitPass() {
1387 if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
1388 addPass(&GCNCreateVOPDID);
1389 addPass(createSIMemoryLegalizerPass());
1390 addPass(createSIInsertWaitcntsPass());
1391
1392 addPass(createSIModeRegisterPass());
1393
1394 if (getOptLevel() > CodeGenOpt::None)
1395 addPass(&SIInsertHardClausesID);
1396
1397 addPass(&SILateBranchLoweringPassID);
1398 if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
1399 addPass(createAMDGPUSetWavePriorityPass());
1400 if (getOptLevel() > CodeGenOpt::None)
1401 addPass(&SIPreEmitPeepholeID);
1402 // The hazard recognizer that runs as part of the post-ra scheduler does not
1403 // guarantee to be able to handle all hazards correctly. This is because if there
1404 // are multiple scheduling regions in a basic block, the regions are scheduled
1405 // bottom up, so when we begin to schedule a region we don't know what
1406 // instructions were emitted directly before it.
1407 //
1408 // Here we add a stand-alone hazard recognizer pass which can handle all
1409 // cases.
1410 addPass(&PostRAHazardRecognizerID);
1411
1412 if (getOptLevel() > CodeGenOpt::Less)
1413 addPass(&AMDGPUReleaseVGPRsID);
1414
1415 if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
1416 addPass(&AMDGPUInsertDelayAluID);
1417
1418 addPass(&BranchRelaxationPassID);
1419}
1420
1421TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1422 return new GCNPassConfig(*this, PM);
1423}
1424
1425MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
1426 BumpPtrAllocator &Allocator, const Function &F,
1427 const TargetSubtargetInfo *STI) const {
1428 return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
1429 Allocator, F, static_cast<const GCNSubtarget *>(STI));
1430}
1431
1432yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1433 return new yaml::SIMachineFunctionInfo();
1434}
1435
1436yaml::MachineFunctionInfo *
1437GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1438 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1439 return new yaml::SIMachineFunctionInfo(
1440 *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
1441}
1442
1443bool GCNTargetMachine::parseMachineFunctionInfo(
1444 const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1445 SMDiagnostic &Error, SMRange &SourceRange) const {
1446 const yaml::SIMachineFunctionInfo &YamlMFI =
1447 static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1448 MachineFunction &MF = PFS.MF;
1449 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1450
1451 if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1452 return true;
1453
1454 if (MFI->Occupancy == 0) {
1455 // Fix up the subtarget-dependent default value.
1456 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1457 MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
1458 }
1459
1460 auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1461 Register TempReg;
1462 if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1463 SourceRange = RegName.SourceRange;
1464 return true;
1465 }
1466 RegVal = TempReg;
1467
1468 return false;
1469 };
1470
1471 auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
1472 Register &RegVal) {
1473 return !RegName.Value.empty() && parseRegister(RegName, RegVal);
1474 };
1475
1476 if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
1477 return true;
1478
1479 auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
1480 // Create a diagnostic for the register string literal.
1481 const MemoryBuffer &Buffer =
1482 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1483 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1484 RegName.Value.size(), SourceMgr::DK_Error,
1485 "incorrect register class for field", RegName.Value,
1486 std::nullopt, std::nullopt);
1487 SourceRange = RegName.SourceRange;
1488 return true;
1489 };
1490
1491 if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1492 parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1493 parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1494 return true;
1495
1496 if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1497 !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1498 return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1499 }
1500
1501 if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1502 !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1503 return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1504 }
1505
1506 if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1507 !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1508 return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1509 }
1510
1511 for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
1512 Register ParsedReg;
1513 if (parseRegister(YamlReg, ParsedReg))
1514 return true;
1515
1516 MFI->reserveWWMRegister(ParsedReg);
1517 }
1518
1519 auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
1520 const TargetRegisterClass &RC,
1521 ArgDescriptor &Arg, unsigned UserSGPRs,
1522 unsigned SystemSGPRs) {
1523 // Skip parsing if the argument is not present.
1524 if (!A)
1525 return false;
1526
1527 if (A->IsRegister) {
1528 Register Reg;
1529 if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1530 SourceRange = A->RegisterName.SourceRange;
1531 return true;
1532 }
1533 if (!RC.contains(Reg))
1534 return diagnoseRegisterClass(A->RegisterName);
1535 Arg = ArgDescriptor::createRegister(Reg);
1536 } else
1537 Arg = ArgDescriptor::createStack(A->StackOffset);
1538 // Check and apply the optional mask.
1539 if (A->Mask)
1540 Arg = ArgDescriptor::createArg(Arg, *A->Mask);
1541
1542 MFI->NumUserSGPRs += UserSGPRs;
1543 MFI->NumSystemSGPRs += SystemSGPRs;
1544 return false;
1545 };
1546
1547 if (YamlMFI.ArgInfo &&
1548 (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1549 AMDGPU::SGPR_128RegClass,
1550 MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1551 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1552 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1553 2, 0) ||
1554 parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1555 MFI->ArgInfo.QueuePtr, 2, 0) ||
1556 parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1557 AMDGPU::SReg_64RegClass,
1558 MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1559 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1560 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1561 2, 0) ||
1562 parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1563 AMDGPU::SReg_64RegClass,
1564 MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1565 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1566 AMDGPU::SGPR_32RegClass,
1567 MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1568 parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
1569 AMDGPU::SGPR_32RegClass,
1570 MFI->ArgInfo.LDSKernelId, 0, 1) ||
1571 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1572 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1573 0, 1) ||
1574 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1575 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1576 0, 1) ||
1577 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1578 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1579 0, 1) ||
1580 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1581 AMDGPU::SGPR_32RegClass,
1582 MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1583 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1584 AMDGPU::SGPR_32RegClass,
1585 MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1586 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1587 AMDGPU::SReg_64RegClass,
1588 MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1589 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1590 AMDGPU::SReg_64RegClass,
1591 MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1592 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1593 AMDGPU::VGPR_32RegClass,
1594 MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1595 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1596 AMDGPU::VGPR_32RegClass,
1597 MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1598 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1599 AMDGPU::VGPR_32RegClass,
1600 MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1601 return true;
1602
1603 MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1604 MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1605
1606 // FIXME: Move proper support for denormal-fp-math into base MachineFunction
1607 MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
1608 ? DenormalMode::IEEE
1609 : DenormalMode::PreserveSign;
1610 MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
1611 ? DenormalMode::IEEE
1612 : DenormalMode::PreserveSign;
1613
1614 MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
1615 ? DenormalMode::IEEE
1616 : DenormalMode::PreserveSign;
1617 MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
1618 ? DenormalMode::IEEE
1619 : DenormalMode::PreserveSign;
1620
1621 return false;
1622}
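For reference, the input parseMachineFunctionInfo consumes is the machineFunctionInfo block of a MIR file. An illustrative fragment, with made-up register assignments (field spellings follow the yaml::SIMachineFunctionInfo mapping used above):

machineFunctionInfo:
  isEntryFunction:   true
  occupancy:         8
  scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
  frameOffsetReg:    '$sgpr33'
  stackPtrOffsetReg: '$sgpr32'
  argumentInfo:
    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
    kernargSegmentPtr:    { reg: '$sgpr4_sgpr5' }
    workGroupIDX:         { reg: '$sgpr6' }
    workItemIDX:          { reg: '$vgpr0' }

A malformed register name or a register of the wrong class is reported through the SMDiagnostic/SMRange plumbing above, pointing back at the offending YAML string.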