Bug Summary

File: /build/source/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Warning: line 129, column 5
Value stored to 'Ctor' is never read
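
The flagged store is the reassignment of the local 'Ctor' in
initializeDefaultVGPRRegisterAllocatorOnce: the 'if (!Ctor)' guard reads the
value returned by getDefault(), but the value written at line 129 is never
read again, because line 130 passes VGPRRegAlloc directly to setDefault().
The diagnostic comes from the deadcode checker group enabled in the analyzer
invocation below; the sibling SGPR initializer at line 120 follows the same
pattern. A minimal sketch of one possible fix is to drop the dead assignment:

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (!Ctor) {
    // setDefault() records the selection; the local 'Ctor' is only needed
    // for the null check above, so no reassignment is required.
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}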

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUTargetMachine.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/source/llvm/lib/Target/AMDGPU -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// The AMDGPU target machine contains all of the hardware specific
11/// information needed to emit code for SI+ GPUs.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUTargetMachine.h"
16#include "AMDGPU.h"
17#include "AMDGPUAliasAnalysis.h"
18#include "AMDGPUCtorDtorLowering.h"
19#include "AMDGPUExportClustering.h"
20#include "AMDGPUIGroupLP.h"
21#include "AMDGPUMacroFusion.h"
22#include "AMDGPURegBankSelect.h"
23#include "AMDGPUTargetObjectFile.h"
24#include "AMDGPUTargetTransformInfo.h"
25#include "AMDGPUUnifyDivergentExitNodes.h"
26#include "GCNIterativeScheduler.h"
27#include "GCNSchedStrategy.h"
28#include "GCNVOPDUtils.h"
29#include "R600.h"
30#include "R600MachineFunctionInfo.h"
31#include "R600TargetMachine.h"
32#include "SIMachineFunctionInfo.h"
33#include "SIMachineScheduler.h"
34#include "TargetInfo/AMDGPUTargetInfo.h"
35#include "Utils/AMDGPUBaseInfo.h"
36#include "llvm/Analysis/CGSCCPassManager.h"
37#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
38#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
39#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
40#include "llvm/CodeGen/GlobalISel/Legalizer.h"
41#include "llvm/CodeGen/GlobalISel/Localizer.h"
42#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
43#include "llvm/CodeGen/MIRParser/MIParser.h"
44#include "llvm/CodeGen/Passes.h"
45#include "llvm/CodeGen/RegAllocRegistry.h"
46#include "llvm/CodeGen/TargetPassConfig.h"
47#include "llvm/IR/IntrinsicsAMDGPU.h"
48#include "llvm/IR/PassManager.h"
49#include "llvm/IR/PatternMatch.h"
50#include "llvm/InitializePasses.h"
51#include "llvm/MC/TargetRegistry.h"
52#include "llvm/Passes/PassBuilder.h"
53#include "llvm/Transforms/IPO.h"
54#include "llvm/Transforms/IPO/AlwaysInliner.h"
55#include "llvm/Transforms/IPO/GlobalDCE.h"
56#include "llvm/Transforms/IPO/Internalize.h"
57#include "llvm/Transforms/Scalar.h"
58#include "llvm/Transforms/Scalar/GVN.h"
59#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
60#include "llvm/Transforms/Utils.h"
61#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
62#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
63#include <optional>
64
65using namespace llvm;
66using namespace llvm::PatternMatch;
67
68namespace {
69class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
70public:
71 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
72 : RegisterRegAllocBase(N, D, C) {}
73};
74
75class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
76public:
77 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
78 : RegisterRegAllocBase(N, D, C) {}
79};
80
81static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
82 const TargetRegisterClass &RC) {
83 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
84}
85
86static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
87 const TargetRegisterClass &RC) {
88 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
89}
90
91
92/// -{sgpr|vgpr}-regalloc=... command line option.
93static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
94
95/// A dummy default pass factory indicates whether the register allocator is
96/// overridden on the command line.
97static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
98static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
99
100static SGPRRegisterRegAlloc
101defaultSGPRRegAlloc("default",
102 "pick SGPR register allocator based on -O option",
103 useDefaultRegisterAllocator);
104
105static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
106 RegisterPassParser<SGPRRegisterRegAlloc>>
107SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
108 cl::desc("Register allocator to use for SGPRs"));
109
110static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
111 RegisterPassParser<VGPRRegisterRegAlloc>>
112VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
113 cl::desc("Register allocator to use for VGPRs"));
114
115
116static void initializeDefaultSGPRRegisterAllocatorOnce() {
117 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
118
119 if (!Ctor) {
120 Ctor = SGPRRegAlloc;
121 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
122 }
123}
124
125static void initializeDefaultVGPRRegisterAllocatorOnce() {
126 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
127
128 if (!Ctor) {
129 Ctor = VGPRRegAlloc;
Value stored to 'Ctor' is never read
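(The store is dead because 'Ctor' is never read after this point: the guard on line 119 already consumed the initial value, and line 130 passes 'VGPRRegAlloc' to setDefault() directly.)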
130 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
131 }
132}
133
134static FunctionPass *createBasicSGPRRegisterAllocator() {
135 return createBasicRegisterAllocator(onlyAllocateSGPRs);
136}
137
138static FunctionPass *createGreedySGPRRegisterAllocator() {
139 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
140}
141
142static FunctionPass *createFastSGPRRegisterAllocator() {
143 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
144}
145
146static FunctionPass *createBasicVGPRRegisterAllocator() {
147 return createBasicRegisterAllocator(onlyAllocateVGPRs);
148}
149
150static FunctionPass *createGreedyVGPRRegisterAllocator() {
151 return createGreedyRegisterAllocator(onlyAllocateVGPRs);
152}
153
154static FunctionPass *createFastVGPRRegisterAllocator() {
155 return createFastRegisterAllocator(onlyAllocateVGPRs, true);
156}
157
158static SGPRRegisterRegAlloc basicRegAllocSGPR(
159 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
160static SGPRRegisterRegAlloc greedyRegAllocSGPR(
161 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
162
163static SGPRRegisterRegAlloc fastRegAllocSGPR(
164 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
165
166
167static VGPRRegisterRegAlloc basicRegAllocVGPR(
168 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
169static VGPRRegisterRegAlloc greedyRegAllocVGPR(
170 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
171
172static VGPRRegisterRegAlloc fastRegAllocVGPR(
173 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
174}
175
176static cl::opt<bool> EnableSROA(
177 "amdgpu-sroa",
178 cl::desc("Run SROA after promote alloca pass"),
179 cl::ReallyHidden,
180 cl::init(true));
181
182static cl::opt<bool>
183EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
184 cl::desc("Run early if-conversion"),
185 cl::init(false));
186
187static cl::opt<bool>
188OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
189 cl::desc("Run pre-RA exec mask optimizations"),
190 cl::init(true));
191
192// Option to disable vectorizer for tests.
193static cl::opt<bool> EnableLoadStoreVectorizer(
194 "amdgpu-load-store-vectorizer",
195 cl::desc("Enable load store vectorizer"),
196 cl::init(true),
197 cl::Hidden);
198
199// Option to control global loads scalarization
200static cl::opt<bool> ScalarizeGlobal(
201 "amdgpu-scalarize-global-loads",
202 cl::desc("Enable global load scalarization"),
203 cl::init(true),
204 cl::Hidden);
205
206// Option to run internalize pass.
207static cl::opt<bool> InternalizeSymbols(
208 "amdgpu-internalize-symbols",
209 cl::desc("Enable elimination of non-kernel functions and unused globals"),
210 cl::init(false),
211 cl::Hidden);
212
213// Option to inline all early.
214static cl::opt<bool> EarlyInlineAll(
215 "amdgpu-early-inline-all",
216 cl::desc("Inline all functions early"),
217 cl::init(false),
218 cl::Hidden);
219
220static cl::opt<bool> RemoveIncompatibleFunctions(
221 "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
222 cl::desc("Enable removal of functions when they "
223 "use features not supported by the target GPU"),
224 cl::init(true));
225
226static cl::opt<bool> EnableSDWAPeephole(
227 "amdgpu-sdwa-peephole",
228 cl::desc("Enable SDWA peepholer"),
229 cl::init(true));
230
231static cl::opt<bool> EnableDPPCombine(
232 "amdgpu-dpp-combine",
233 cl::desc("Enable DPP combiner"),
234 cl::init(true));
235
236// Enable address space based alias analysis
237static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
238 cl::desc("Enable AMDGPU Alias Analysis"),
239 cl::init(true));
240
241// Option to run late CFG structurizer
242static cl::opt<bool, true> LateCFGStructurize(
243 "amdgpu-late-structurize",
244 cl::desc("Enable late CFG structurization"),
245 cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
246 cl::Hidden);
247
248// Enable lib call simplifications
249static cl::opt<bool> EnableLibCallSimplify(
250 "amdgpu-simplify-libcall",
251 cl::desc("Enable amdgpu library simplifications"),
252 cl::init(true),
253 cl::Hidden);
254
255static cl::opt<bool> EnableLowerKernelArguments(
256 "amdgpu-ir-lower-kernel-arguments",
257 cl::desc("Lower kernel argument loads in IR pass"),
258 cl::init(true),
259 cl::Hidden);
260
261static cl::opt<bool> EnableRegReassign(
262 "amdgpu-reassign-regs",
263 cl::desc("Enable register reassign optimizations on gfx10+"),
264 cl::init(true),
265 cl::Hidden);
266
267static cl::opt<bool> OptVGPRLiveRange(
268 "amdgpu-opt-vgpr-liverange",
269 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
270 cl::init(true), cl::Hidden);
271
272// Enable atomic optimization
273static cl::opt<bool> EnableAtomicOptimizations(
274 "amdgpu-atomic-optimizations",
275 cl::desc("Enable atomic optimizations"),
276 cl::init(false),
277 cl::Hidden);
278
279// Enable Mode register optimization
280static cl::opt<bool> EnableSIModeRegisterPass(
281 "amdgpu-mode-register",
282 cl::desc("Enable mode register pass"),
283 cl::init(true),
284 cl::Hidden);
285
286// Enable GFX11+ s_delay_alu insertion
287static cl::opt<bool>
288 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
289 cl::desc("Enable s_delay_alu insertion"),
290 cl::init(true), cl::Hidden);
291
292// Enable GFX11+ VOPD
293static cl::opt<bool>
294 EnableVOPD("amdgpu-enable-vopd",
295 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
296 cl::init(true), cl::Hidden);
297
298// Option used in lit tests to prevent the inspected patterns from being dead-code eliminated.
299static cl::opt<bool>
300EnableDCEInRA("amdgpu-dce-in-ra",
301 cl::init(true), cl::Hidden,
302 cl::desc("Enable machine DCE inside regalloc"));
303
304static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
305 cl::desc("Adjust wave priority"),
306 cl::init(false), cl::Hidden);
307
308static cl::opt<bool> EnableScalarIRPasses(
309 "amdgpu-scalar-ir-passes",
310 cl::desc("Enable scalar IR passes"),
311 cl::init(true),
312 cl::Hidden);
313
314static cl::opt<bool> EnableStructurizerWorkarounds(
315 "amdgpu-enable-structurizer-workarounds",
316 cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
317 cl::Hidden);
318
319static cl::opt<bool, true> EnableLowerModuleLDS(
320 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
321 cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
322 cl::Hidden);
323
324static cl::opt<bool> EnablePreRAOptimizations(
325 "amdgpu-enable-pre-ra-optimizations",
326 cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
327 cl::Hidden);
328
329static cl::opt<bool> EnablePromoteKernelArguments(
330 "amdgpu-enable-promote-kernel-arguments",
331 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
332 cl::Hidden, cl::init(true));
333
334static cl::opt<bool> EnableMaxIlpSchedStrategy(
335 "amdgpu-enable-max-ilp-scheduling-strategy",
336 cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
337 cl::Hidden, cl::init(false));
338
339extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
340 // Register the target
341 RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
342 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
343
344 PassRegistry *PR = PassRegistry::getPassRegistry();
345 initializeR600ClauseMergePassPass(*PR);
346 initializeR600ControlFlowFinalizerPass(*PR);
347 initializeR600PacketizerPass(*PR);
348 initializeR600ExpandSpecialInstrsPassPass(*PR);
349 initializeR600VectorRegMergerPass(*PR);
350 initializeGlobalISel(*PR);
351 initializeAMDGPUDAGToDAGISelPass(*PR);
352 initializeGCNDPPCombinePass(*PR);
353 initializeSILowerI1CopiesPass(*PR);
354 initializeSILowerSGPRSpillsPass(*PR);
355 initializeSIFixSGPRCopiesPass(*PR);
356 initializeSIFixVGPRCopiesPass(*PR);
357 initializeSIFoldOperandsPass(*PR);
358 initializeSIPeepholeSDWAPass(*PR);
359 initializeSIShrinkInstructionsPass(*PR);
360 initializeSIOptimizeExecMaskingPreRAPass(*PR);
361 initializeSIOptimizeVGPRLiveRangePass(*PR);
362 initializeSILoadStoreOptimizerPass(*PR);
363 initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
364 initializeAMDGPUAlwaysInlinePass(*PR);
365 initializeAMDGPUAttributorPass(*PR);
366 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
367 initializeAMDGPUAnnotateUniformValuesPass(*PR);
368 initializeAMDGPUArgumentUsageInfoPass(*PR);
369 initializeAMDGPUAtomicOptimizerPass(*PR);
370 initializeAMDGPULowerKernelArgumentsPass(*PR);
371 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
372 initializeAMDGPULowerKernelAttributesPass(*PR);
373 initializeAMDGPULowerIntrinsicsPass(*PR);
374 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
375 initializeAMDGPUPostLegalizerCombinerPass(*PR);
376 initializeAMDGPUPreLegalizerCombinerPass(*PR);
377 initializeAMDGPURegBankCombinerPass(*PR);
378 initializeAMDGPURegBankSelectPass(*PR);
379 initializeAMDGPUPromoteAllocaPass(*PR);
380 initializeAMDGPUPromoteAllocaToVectorPass(*PR);
381 initializeAMDGPUCodeGenPreparePass(*PR);
382 initializeAMDGPULateCodeGenPreparePass(*PR);
383 initializeAMDGPUPropagateAttributesEarlyPass(*PR);
384 initializeAMDGPUPropagateAttributesLatePass(*PR);
385 initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
386 initializeAMDGPULowerModuleLDSPass(*PR);
387 initializeAMDGPURewriteOutArgumentsPass(*PR);
388 initializeAMDGPURewriteUndefForPHIPass(*PR);
389 initializeAMDGPUUnifyMetadataPass(*PR);
390 initializeSIAnnotateControlFlowPass(*PR);
391 initializeAMDGPUReleaseVGPRsPass(*PR);
392 initializeAMDGPUInsertDelayAluPass(*PR);
393 initializeSIInsertHardClausesPass(*PR);
394 initializeSIInsertWaitcntsPass(*PR);
395 initializeSIModeRegisterPass(*PR);
396 initializeSIWholeQuadModePass(*PR);
397 initializeSILowerControlFlowPass(*PR);
398 initializeSIPreEmitPeepholePass(*PR);
399 initializeSILateBranchLoweringPass(*PR);
400 initializeSIMemoryLegalizerPass(*PR);
401 initializeSIOptimizeExecMaskingPass(*PR);
402 initializeSIPreAllocateWWMRegsPass(*PR);
403 initializeSIFormMemoryClausesPass(*PR);
404 initializeSIPostRABundlerPass(*PR);
405 initializeGCNCreateVOPDPass(*PR);
406 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
407 initializeAMDGPUAAWrapperPassPass(*PR);
408 initializeAMDGPUExternalAAWrapperPass(*PR);
409 initializeAMDGPUUseNativeCallsPass(*PR);
410 initializeAMDGPUSimplifyLibCallsPass(*PR);
411 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
412 initializeAMDGPUResourceUsageAnalysisPass(*PR);
413 initializeGCNNSAReassignPass(*PR);
414 initializeGCNPreRAOptimizationsPass(*PR);
415}
416
417static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
418 return std::make_unique<AMDGPUTargetObjectFile>();
419}
420
421static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
422 return new SIScheduleDAGMI(C);
423}
424
425static ScheduleDAGInstrs *
426createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
427 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
428 ScheduleDAGMILive *DAG =
429 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
430 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
431 if (ST.shouldClusterStores())
432 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
433 DAG->addMutation(createIGroupLPDAGMutation());
434 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
435 DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
436 return DAG;
437}
438
439static ScheduleDAGInstrs *
440createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
441 ScheduleDAGMILive *DAG =
442 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
443 DAG->addMutation(createIGroupLPDAGMutation());
444 return DAG;
445}
446
447static ScheduleDAGInstrs *
448createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
449 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
450 auto DAG = new GCNIterativeScheduler(C,
451 GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
452 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
453 if (ST.shouldClusterStores())
454 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
455 return DAG;
456}
457
458static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
459 return new GCNIterativeScheduler(C,
460 GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
461}
462
463static ScheduleDAGInstrs *
464createIterativeILPMachineScheduler(MachineSchedContext *C) {
465 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
466 auto DAG = new GCNIterativeScheduler(C,
467 GCNIterativeScheduler::SCHEDULE_ILP);
468 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
469 if (ST.shouldClusterStores())
470 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
471 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
472 return DAG;
473}
474
475static MachineSchedRegistry
476SISchedRegistry("si", "Run SI's custom scheduler",
477 createSIMachineScheduler);
478
479static MachineSchedRegistry
480GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
481 "Run GCN scheduler to maximize occupancy",
482 createGCNMaxOccupancyMachineScheduler);
483
484static MachineSchedRegistry
485 GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
486 createGCNMaxILPMachineScheduler);
487
488static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
489 "gcn-iterative-max-occupancy-experimental",
490 "Run GCN scheduler to maximize occupancy (experimental)",
491 createIterativeGCNMaxOccupancyMachineScheduler);
492
493static MachineSchedRegistry GCNMinRegSchedRegistry(
494 "gcn-iterative-minreg",
495 "Run GCN iterative scheduler for minimal register usage (experimental)",
496 createMinRegScheduler);
497
498static MachineSchedRegistry GCNILPSchedRegistry(
499 "gcn-iterative-ilp",
500 "Run GCN iterative scheduler for ILP scheduling (experimental)",
501 createIterativeILPMachineScheduler);
502
503static StringRef computeDataLayout(const Triple &TT) {
504 if (TT.getArch() == Triple::r600) {
505 // 32-bit pointers.
506 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
507 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
508 }
509
510 // 32-bit private, local, and region pointers. 64-bit global, constant and
511 // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
512 // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
513 // (address space 7), and 128-bit non-integral buffer resources (address
514 // space 8) which cannot be non-trivially accessed by LLVM memory operations
515 // like getelementptr.
516 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
517 "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
518 "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
519 "G1-ni:7:8";
520}
521
522LLVM_READNONE
523static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
524 if (!GPU.empty())
525 return GPU;
526
527 // Need to default to a target with flat support for HSA.
528 if (TT.getArch() == Triple::amdgcn)
529 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
530
531 return "r600";
532}
533
534static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
535 // The AMDGPU toolchain only supports generating shared objects, so we
536 // must always use PIC.
537 return Reloc::PIC_;
538}
539
540AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
541 StringRef CPU, StringRef FS,
542 TargetOptions Options,
543 std::optional<Reloc::Model> RM,
544 std::optional<CodeModel::Model> CM,
545 CodeGenOpt::Level OptLevel)
546 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
547 FS, Options, getEffectiveRelocModel(RM),
548 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
549 TLOF(createTLOF(getTargetTriple())) {
550 initAsmInfo();
551 if (TT.getArch() == Triple::amdgcn) {
552 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
553 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
554 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
555 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
556 }
557}
558
559bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
560bool AMDGPUTargetMachine::EnableFunctionCalls = false;
561bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
562
563AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
564
565StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
566 Attribute GPUAttr = F.getFnAttribute("target-cpu");
567 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
568}
569
570StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
571 Attribute FSAttr = F.getFnAttribute("target-features");
572
573 return FSAttr.isValid() ? FSAttr.getValueAsString()
574 : getTargetFeatureString();
575}
576
577/// Predicate for Internalize pass.
578static bool mustPreserveGV(const GlobalValue &GV) {
579 if (const Function *F = dyn_cast<Function>(&GV))
580 return F->isDeclaration() || F->getName().startswith("__asan_") ||
581 F->getName().startswith("__sanitizer_") ||
582 AMDGPU::isEntryFunctionCC(F->getCallingConv());
583
584 GV.removeDeadConstantUsers();
585 return !GV.use_empty();
586}
587
588void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
589 AAM.registerFunctionAnalysis<AMDGPUAA>();
590}
591
592void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
593 PB.registerPipelineParsingCallback(
594 [this](StringRef PassName, ModulePassManager &PM,
595 ArrayRef<PassBuilder::PipelineElement>) {
596 if (PassName == "amdgpu-propagate-attributes-late") {
597 PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
598 return true;
599 }
600 if (PassName == "amdgpu-unify-metadata") {
601 PM.addPass(AMDGPUUnifyMetadataPass());
602 return true;
603 }
604 if (PassName == "amdgpu-printf-runtime-binding") {
605 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
606 return true;
607 }
608 if (PassName == "amdgpu-always-inline") {
609 PM.addPass(AMDGPUAlwaysInlinePass());
610 return true;
611 }
612 if (PassName == "amdgpu-lower-module-lds") {
613 PM.addPass(AMDGPULowerModuleLDSPass());
614 return true;
615 }
616 if (PassName == "amdgpu-lower-ctor-dtor") {
617 PM.addPass(AMDGPUCtorDtorLoweringPass());
618 return true;
619 }
620 return false;
621 });
622 PB.registerPipelineParsingCallback(
623 [this](StringRef PassName, FunctionPassManager &PM,
624 ArrayRef<PassBuilder::PipelineElement>) {
625 if (PassName == "amdgpu-simplifylib") {
626 PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
627 return true;
628 }
629 if (PassName == "amdgpu-usenative") {
630 PM.addPass(AMDGPUUseNativeCallsPass());
631 return true;
632 }
633 if (PassName == "amdgpu-promote-alloca") {
634 PM.addPass(AMDGPUPromoteAllocaPass(*this));
635 return true;
636 }
637 if (PassName == "amdgpu-promote-alloca-to-vector") {
638 PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
639 return true;
640 }
641 if (PassName == "amdgpu-lower-kernel-attributes") {
642 PM.addPass(AMDGPULowerKernelAttributesPass());
643 return true;
644 }
645 if (PassName == "amdgpu-propagate-attributes-early") {
646 PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
647 return true;
648 }
649 if (PassName == "amdgpu-promote-kernel-arguments") {
650 PM.addPass(AMDGPUPromoteKernelArgumentsPass());
651 return true;
652 }
653 if (PassName == "amdgpu-unify-divergent-exit-nodes") {
654 PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
655 return true;
656 }
657 if (PassName == "amdgpu-atomic-optimizer") {
658 PM.addPass(AMDGPUAtomicOptimizerPass(*this));
659 return true;
660 }
661 return false;
662 });
663
664 PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
665 FAM.registerPass([&] { return AMDGPUAA(); });
666 });
667
668 PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
669 if (AAName == "amdgpu-aa") {
670 AAM.registerFunctionAnalysis<AMDGPUAA>();
671 return true;
672 }
673 return false;
674 });
675
676 PB.registerPipelineStartEPCallback(
677 [this](ModulePassManager &PM, OptimizationLevel Level) {
678 FunctionPassManager FPM;
679 FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
680 FPM.addPass(AMDGPUUseNativeCallsPass());
681 if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
682 FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
683 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
684 });
685
686 PB.registerPipelineEarlySimplificationEPCallback(
687 [this](ModulePassManager &PM, OptimizationLevel Level) {
688 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
689
690 if (Level == OptimizationLevel::O0)
691 return;
692
693 PM.addPass(AMDGPUUnifyMetadataPass());
694
695 if (InternalizeSymbols) {
696 PM.addPass(InternalizePass(mustPreserveGV));
697 }
698 PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
699 if (InternalizeSymbols) {
700 PM.addPass(GlobalDCEPass());
701 }
702 if (EarlyInlineAll && !EnableFunctionCalls)
703 PM.addPass(AMDGPUAlwaysInlinePass());
704 });
705
706 PB.registerCGSCCOptimizerLateEPCallback(
707 [this](CGSCCPassManager &PM, OptimizationLevel Level) {
708 if (Level == OptimizationLevel::O0)
709 return;
710
711 FunctionPassManager FPM;
712
713 // Add promote kernel arguments pass to the opt pipeline right before
714 // infer address spaces, which is needed to do the actual address space
715 // rewriting.
716 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
717 EnablePromoteKernelArguments)
718 FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
719
720 // Add infer address spaces pass to the opt pipeline after inlining
721 // but before SROA to increase SROA opportunities.
722 FPM.addPass(InferAddressSpacesPass());
723
724 // This should run after inlining to have any chance of doing
725 // anything, and before other cleanup optimizations.
726 FPM.addPass(AMDGPULowerKernelAttributesPass());
727
728 if (Level != OptimizationLevel::O0) {
729 // Promote alloca to vector before SROA and loop unroll. If we
730 // manage to eliminate allocas before unroll we may choose to unroll
731 // less.
732 FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
733 }
734
735 PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
736 });
737}
738
739int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
740 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
741 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
742 AddrSpace == AMDGPUAS::REGION_ADDRESS)
743 ? -1
744 : 0;
745}
746
747bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
748 unsigned DestAS) const {
749 return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
750 AMDGPU::isFlatGlobalAddrSpace(DestAS);
751}
752
753unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
754 const auto *LD = dyn_cast<LoadInst>(V);
755 if (!LD)
756 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
757
758 // The loaded value must be a generic pointer.
759 assert(V->getType()->isPointerTy() &&
760 V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
761
762 const auto *Ptr = LD->getPointerOperand();
763 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
764 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
765 // For a generic pointer loaded from the constant memory, it could be assumed
766 // as a global pointer since the constant memory is only populated on the
767 // host side. As implied by the offload programming model, only global
768 // pointers could be referenced on the host side.
769 return AMDGPUAS::GLOBAL_ADDRESS;
770}
771
772std::pair<const Value *, unsigned>
773AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
774 if (auto *II = dyn_cast<IntrinsicInst>(V)) {
775 switch (II->getIntrinsicID()) {
776 case Intrinsic::amdgcn_is_shared:
777 return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
778 case Intrinsic::amdgcn_is_private:
779 return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
780 default:
781 break;
782 }
783 return std::pair(nullptr, -1);
784 }
785 // Check the global pointer predication based on
786 // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative and
787 // the order of 'is_shared' and 'is_private' is not significant.
788 Value *Ptr;
789 if (match(
790 const_cast<Value *>(V),
791 m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
792 m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
793 m_Deferred(Ptr))))))
794 return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
795
796 return std::pair(nullptr, -1);
797}
798
799unsigned
800AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
801 switch (Kind) {
802 case PseudoSourceValue::Stack:
803 case PseudoSourceValue::FixedStack:
804 return AMDGPUAS::PRIVATE_ADDRESS;
805 case PseudoSourceValue::ConstantPool:
806 case PseudoSourceValue::GOT:
807 case PseudoSourceValue::JumpTable:
808 case PseudoSourceValue::GlobalValueCallEntry:
809 case PseudoSourceValue::ExternalSymbolCallEntry:
810 return AMDGPUAS::CONSTANT_ADDRESS;
811 }
812 return AMDGPUAS::FLAT_ADDRESS;
813}
814
815//===----------------------------------------------------------------------===//
816// GCN Target Machine (SI+)
817//===----------------------------------------------------------------------===//
818
819GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
820 StringRef CPU, StringRef FS,
821 TargetOptions Options,
822 std::optional<Reloc::Model> RM,
823 std::optional<CodeModel::Model> CM,
824 CodeGenOpt::Level OL, bool JIT)
825 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
826
827const TargetSubtargetInfo *
828GCNTargetMachine::getSubtargetImpl(const Function &F) const {
829 StringRef GPU = getGPUName(F);
830 StringRef FS = getFeatureString(F);
831
832 SmallString<128> SubtargetKey(GPU);
833 SubtargetKey.append(FS);
834
835 auto &I = SubtargetMap[SubtargetKey];
836 if (!I) {
837 // This needs to be done before we create a new subtarget since any
838 // creation will depend on the TM and the code generation flags on the
839 // function that reside in TargetOptions.
840 resetTargetOptions(F);
841 I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
842 }
843
844 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
845
846 return I.get();
847}
848
849TargetTransformInfo
850GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
851 return TargetTransformInfo(GCNTTIImpl(this, F));
852}
853
854//===----------------------------------------------------------------------===//
855// AMDGPU Pass Setup
856//===----------------------------------------------------------------------===//
857
858std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
859 return getStandardCSEConfigForOpt(TM->getOptLevel());
860}
861
862namespace {
863
864class GCNPassConfig final : public AMDGPUPassConfig {
865public:
866 GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
867 : AMDGPUPassConfig(TM, PM) {
868 // It is necessary to know the register usage of the entire call graph. We
869 // allow calls without EnableAMDGPUFunctionCalls if they are marked
870 // noinline, so this is always required.
871 setRequiresCodeGenSCCOrder(true);
872 substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
873 }
874
875 GCNTargetMachine &getGCNTargetMachine() const {
876 return getTM<GCNTargetMachine>();
877 }
878
879 ScheduleDAGInstrs *
880 createMachineScheduler(MachineSchedContext *C) const override;
881
882 ScheduleDAGInstrs *
883 createPostMachineScheduler(MachineSchedContext *C) const override {
884 ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
885 C, std::make_unique<PostGenericScheduler>(C),
886 /*RemoveKillFlags=*/true);
887 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
888 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
889 if (ST.shouldClusterStores())
890 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
891 DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
892 DAG->addMutation(createIGroupLPDAGMutation());
893 if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
894 DAG->addMutation(createVOPDPairingMutation());
895 return DAG;
896 }
897
898 bool addPreISel() override;
899 void addMachineSSAOptimization() override;
900 bool addILPOpts() override;
901 bool addInstSelector() override;
902 bool addIRTranslator() override;
903 void addPreLegalizeMachineIR() override;
904 bool addLegalizeMachineIR() override;
905 void addPreRegBankSelect() override;
906 bool addRegBankSelect() override;
907 void addPreGlobalInstructionSelect() override;
908 bool addGlobalInstructionSelect() override;
909 void addFastRegAlloc() override;
910 void addOptimizedRegAlloc() override;
911
912 FunctionPass *createSGPRAllocPass(bool Optimized);
913 FunctionPass *createVGPRAllocPass(bool Optimized);
914 FunctionPass *createRegAllocPass(bool Optimized) override;
915
916 bool addRegAssignAndRewriteFast() override;
917 bool addRegAssignAndRewriteOptimized() override;
918
919 void addPreRegAlloc() override;
920 bool addPreRewrite() override;
921 void addPostRegAlloc() override;
922 void addPreSched2() override;
923 void addPreEmitPass() override;
924};
925
926} // end anonymous namespace
927
928AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
929 : TargetPassConfig(TM, PM) {
930 // Exceptions and StackMaps are not supported, so these passes will never do
931 // anything.
932 disablePass(&StackMapLivenessID);
933 disablePass(&FuncletLayoutID);
934 // Garbage collection is not supported.
935 disablePass(&GCLoweringID);
936 disablePass(&ShadowStackGCLoweringID);
937}
938
939void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
940 if (getOptLevel() == CodeGenOpt::Aggressive)
941 addPass(createGVNPass());
942 else
943 addPass(createEarlyCSEPass());
944}
945
946void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
947 addPass(createLICMPass());
948 addPass(createSeparateConstOffsetFromGEPPass());
949 // ReassociateGEPs exposes more opportunities for SLSR. See
950 // the example in reassociate-geps-and-slsr.ll.
951 addPass(createStraightLineStrengthReducePass());
952 // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
953 // EarlyCSE can reuse.
954 addEarlyCSEOrGVNPass();
955 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
956 addPass(createNaryReassociatePass());
957 // NaryReassociate on GEPs creates redundant common expressions, so run
958 // EarlyCSE after it.
959 addPass(createEarlyCSEPass());
960}
961
962void AMDGPUPassConfig::addIRPasses() {
963 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
964
965 // There is no reason to run these.
966 disablePass(&StackMapLivenessID);
967 disablePass(&FuncletLayoutID);
968 disablePass(&PatchableFunctionID);
969
970 addPass(createAMDGPUPrintfRuntimeBinding());
971 addPass(createAMDGPUCtorDtorLoweringLegacyPass());
972
973 // Run the propagate attributes pass in the backend in case opt was not run.
974 addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
975
976 addPass(createAMDGPULowerIntrinsicsPass());
977
978 // Function calls are not supported, so make sure we inline everything.
979 addPass(createAMDGPUAlwaysInlinePass());
980 addPass(createAlwaysInlinerLegacyPass());
981
982 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
983 if (TM.getTargetTriple().getArch() == Triple::r600)
984 addPass(createR600OpenCLImageTypeLoweringPass());
985
986 // Replace OpenCL enqueued block function pointers with global variables.
987 addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
988
989 // Runs before PromoteAlloca so the latter can account for function uses
990 if (EnableLowerModuleLDS) {
991 addPass(createAMDGPULowerModuleLDSPass());
992 }
993
994 if (TM.getOptLevel() > CodeGenOpt::None)
995 addPass(createInferAddressSpacesPass());
996
997 addPass(createAtomicExpandPass());
998
999 if (TM.getOptLevel() > CodeGenOpt::None) {
1000 addPass(createAMDGPUPromoteAlloca());
1001
1002 if (EnableSROA)
1003 addPass(createSROAPass());
1004 if (isPassEnabled(EnableScalarIRPasses))
1005 addStraightLineScalarOptimizationPasses();
1006
1007 if (EnableAMDGPUAliasAnalysis) {
1008 addPass(createAMDGPUAAWrapperPass());
1009 addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1010 AAResults &AAR) {
1011 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1012 AAR.addAAResult(WrapperPass->getResult());
1013 }));
1014 }
1015
1016 if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1017 // TODO: May want to move later or split into an early and late one.
1018 addPass(createAMDGPUCodeGenPreparePass());
1019 }
1020 }
1021
1022 TargetPassConfig::addIRPasses();
1023
1024 // EarlyCSE is not always strong enough to clean up what LSR produces. For
1025 // example, GVN can combine
1026 //
1027 // %0 = add %a, %b
1028 // %1 = add %b, %a
1029 //
1030 // and
1031 //
1032 // %0 = shl nsw %a, 2
1033 // %1 = shl %a, 2
1034 //
1035 // but EarlyCSE can do neither of them.
1036 if (isPassEnabled(EnableScalarIRPasses))
1037 addEarlyCSEOrGVNPass();
1038}
1039
1040void AMDGPUPassConfig::addCodeGenPrepare() {
1041 if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1042 if (RemoveIncompatibleFunctions)
1043 addPass(createAMDGPURemoveIncompatibleFunctionsPass(TM));
1044
1045 addPass(createAMDGPUAttributorPass());
1046
1047 // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1048 // analysis, and should be removed.
1049 addPass(createAMDGPUAnnotateKernelFeaturesPass());
1050 }
1051
1052 if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1053 EnableLowerKernelArguments)
1054 addPass(createAMDGPULowerKernelArgumentsPass());
1055
1056 TargetPassConfig::addCodeGenPrepare();
1057
1058 if (isPassEnabled(EnableLoadStoreVectorizer))
1059 addPass(createLoadStoreVectorizerPass());
1060
1061 // LowerSwitch pass may introduce unreachable blocks that can
1062 // cause unexpected behavior for subsequent passes. Placing it
1063 // here seems better, as these blocks will get cleaned up by
1064 // UnreachableBlockElim inserted next in the pass flow.
1065 addPass(createLowerSwitchPass());
1066}
1067
1068bool AMDGPUPassConfig::addPreISel() {
1069 if (TM->getOptLevel() > CodeGenOpt::None)
1070 addPass(createFlattenCFGPass());
1071 return false;
1072}
1073
1074bool AMDGPUPassConfig::addInstSelector() {
1075 addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
1076 return false;
1077}
1078
1079bool AMDGPUPassConfig::addGCPasses() {
1080 // Do nothing. GC is not supported.
1081 return false;
1082}
1083
1084llvm::ScheduleDAGInstrs *
1085AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1086 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1087 ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1088 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1089 if (ST.shouldClusterStores())
1090 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1091 return DAG;
1092}
1093
1094MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
1095 BumpPtrAllocator &Allocator, const Function &F,
1096 const TargetSubtargetInfo *STI) const {
1097 return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
1098 Allocator, F, static_cast<const R600Subtarget *>(STI));
1099}
1100
1101//===----------------------------------------------------------------------===//
1102// GCN Pass Setup
1103//===----------------------------------------------------------------------===//
1104
1105ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1106 MachineSchedContext *C) const {
1107 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1108 if (ST.enableSIScheduler())
1109 return createSIMachineScheduler(C);
1110
1111 if (EnableMaxIlpSchedStrategy)
1112 return createGCNMaxILPMachineScheduler(C);
1113
1114 return createGCNMaxOccupancyMachineScheduler(C);
1115}
1116
1117bool GCNPassConfig::addPreISel() {
1118 AMDGPUPassConfig::addPreISel();
1119
1120 if (TM->getOptLevel() > CodeGenOpt::None)
1121 addPass(createAMDGPULateCodeGenPreparePass());
1122
1123 if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
1124 addPass(createAMDGPUAtomicOptimizerPass());
1125 }
1126
1127 if (TM->getOptLevel() > CodeGenOpt::None)
1128 addPass(createSinkingPass());
1129
1130 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1131 // regions formed by them.
1132 addPass(&AMDGPUUnifyDivergentExitNodesID);
1133 if (!LateCFGStructurize) {
1134 if (EnableStructurizerWorkarounds) {
1135 addPass(createFixIrreduciblePass());
1136 addPass(createUnifyLoopExitsPass());
1137 }
1138 addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1139 }
1140 addPass(createAMDGPUAnnotateUniformValues());
1141 if (!LateCFGStructurize) {
1142 addPass(createSIAnnotateControlFlowPass());
1143 // TODO: Move this right after structurizeCFG to avoid extra divergence
1144 // analysis. This depends on stopping SIAnnotateControlFlow from making
1145 // control flow modifications.
1146 addPass(createAMDGPURewriteUndefForPHIPass());
1147 }
1148 addPass(createLCSSAPass());
1149
1150 if (TM->getOptLevel() > CodeGenOpt::Less)
1151 addPass(&AMDGPUPerfHintAnalysisID);
1152
1153 return false;
1154}
1155
1156void GCNPassConfig::addMachineSSAOptimization() {
1157 TargetPassConfig::addMachineSSAOptimization();
1158
1159 // We want to fold operands after PeepholeOptimizer has run (or as part of
1160 // it), because it will eliminate extra copies making it easier to fold the
1161 // real source operand. We want to eliminate dead instructions after, so that
1162 // we see fewer uses of the copies. We then need to clean up the dead
1163 // instructions leftover after the operands are folded as well.
1164 //
1165 // XXX - Can we get away without running DeadMachineInstructionElim again?
1166 addPass(&SIFoldOperandsID);
1167 if (EnableDPPCombine)
1168 addPass(&GCNDPPCombineID);
1169 addPass(&SILoadStoreOptimizerID);
1170 if (isPassEnabled(EnableSDWAPeephole)) {
1171 addPass(&SIPeepholeSDWAID);
1172 addPass(&EarlyMachineLICMID);
1173 addPass(&MachineCSEID);
1174 addPass(&SIFoldOperandsID);
1175 }
1176 addPass(&DeadMachineInstructionElimID);
1177 addPass(createSIShrinkInstructionsPass());
1178}
1179
1180bool GCNPassConfig::addILPOpts() {
1181 if (EnableEarlyIfConversion)
1182 addPass(&EarlyIfConverterID);
1183
1184 TargetPassConfig::addILPOpts();
1185 return false;
1186}
1187
1188bool GCNPassConfig::addInstSelector() {
1189 AMDGPUPassConfig::addInstSelector();
1190 addPass(&SIFixSGPRCopiesID);
1191 addPass(createSILowerI1CopiesPass());
1192 return false;
1193}
1194
1195bool GCNPassConfig::addIRTranslator() {
1196 addPass(new IRTranslator(getOptLevel()));
1197 return false;
1198}
1199
1200void GCNPassConfig::addPreLegalizeMachineIR() {
1201 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1202 addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1203 addPass(new Localizer());
1204}
1205
1206bool GCNPassConfig::addLegalizeMachineIR() {
1207 addPass(new Legalizer());
1208 return false;
1209}
1210
1211void GCNPassConfig::addPreRegBankSelect() {
1212 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1213 addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1214}
1215
1216bool GCNPassConfig::addRegBankSelect() {
1217 addPass(new AMDGPURegBankSelect());
1218 return false;
1219}
1220
1221void GCNPassConfig::addPreGlobalInstructionSelect() {
1222 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1223 addPass(createAMDGPURegBankCombiner(IsOptNone));
1224}
1225
1226bool GCNPassConfig::addGlobalInstructionSelect() {
1227 addPass(new InstructionSelect(getOptLevel()));
1228 return false;
1229}
1230
1231void GCNPassConfig::addPreRegAlloc() {
1232 if (LateCFGStructurize) {
1233 addPass(createAMDGPUMachineCFGStructurizerPass());
1234 }
1235}
1236
1237void GCNPassConfig::addFastRegAlloc() {
1238 // FIXME: We have to disable the verifier here because of PHIElimination +
1239 // TwoAddressInstructions disabling it.
1240
1241 // This must be run immediately after phi elimination and before
1242 // TwoAddressInstructions, otherwise the processing of the tied operand of
1243 // SI_ELSE will introduce a copy of the tied operand source after the else.
1244 insertPass(&PHIEliminationID, &SILowerControlFlowID);
1245
1246 insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1247 insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
1248
1249 TargetPassConfig::addFastRegAlloc();
1250}
1251
1252void GCNPassConfig::addOptimizedRegAlloc() {
1253 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1254 // instructions that cause scheduling barriers.
1255 insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1256 insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
1257
1258 if (OptExecMaskPreRA)
1259 insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1260
1261 if (isPassEnabled(EnablePreRAOptimizations))
1262 insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1263
1264 // This is not an essential optimization and it has a noticeable impact on
1265 // compilation time, so we only enable it from O2.
1266 if (TM->getOptLevel() > CodeGenOpt::Less)
1267 insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1268
1269 // FIXME: when an instruction has a Killed operand and the instruction is
1270 // inside a bundle, it seems that only the BUNDLE instruction appears as the
1271 // Kill of the register in LiveVariables. This would trigger a failure in
1272 // the verifier; we should fix it and enable the verifier.
1273 if (OptVGPRLiveRange)
1274 insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
1275 // This must be run immediately after phi elimination and before
1276 // TwoAddressInstructions, otherwise the processing of the tied operand of
1277 // SI_ELSE will introduce a copy of the tied operand source after the else.
1278 insertPass(&PHIEliminationID, &SILowerControlFlowID);
1279
1280 if (EnableDCEInRA)
1281 insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1282
1283 TargetPassConfig::addOptimizedRegAlloc();
1284}
1285
1286bool GCNPassConfig::addPreRewrite() {
1287 if (EnableRegReassign)
1288 addPass(&GCNNSAReassignID);
1289 return true;
1290}
1291
1292FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1293 // Initialize the global default.
1294 llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1295 initializeDefaultSGPRRegisterAllocatorOnce);
1296
1297 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1298 if (Ctor != useDefaultRegisterAllocator)
1299 return Ctor();
1300
1301 if (Optimized)
1302 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1303
1304 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1305}
1306
1307FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1308 // Initialize the global default.
1309 llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1310 initializeDefaultVGPRRegisterAllocatorOnce);
1311
1312 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1313 if (Ctor != useDefaultRegisterAllocator)
1314 return Ctor();
1315
1316 if (Optimized)
1317 return createGreedyVGPRRegisterAllocator();
1318
1319 return createFastVGPRRegisterAllocator();
1320}
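(This is where the default recorded by initializeDefaultVGPRRegisterAllocatorOnce is consumed: getDefault() returns the constructor selected via -vgpr-regalloc, with useDefaultRegisterAllocator acting as a sentinel meaning "choose by optimization level".)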
1321
1322FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1323 llvm_unreachable("should not be used");
1324}
1325
1326static const char RegAllocOptNotSupportedMessage[] =
1327 "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1328
1329bool GCNPassConfig::addRegAssignAndRewriteFast() {
1330 if (!usingDefaultRegAlloc())
1331 report_fatal_error(RegAllocOptNotSupportedMessage);
1332
1333 addPass(createSGPRAllocPass(false));
1334
1335 // Equivalent of PEI for SGPRs.
1336 addPass(&SILowerSGPRSpillsID);
1337
1338 addPass(createVGPRAllocPass(false));
1339 return true;
1340}
1341
1342bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1343 if (!usingDefaultRegAlloc())
1344 report_fatal_error(RegAllocOptNotSupportedMessage);
1345
1346 addPass(createSGPRAllocPass(true));
1347
1348 // Commit allocated register changes. This is mostly necessary because too
1349 // many things rely on the use lists of the physical registers, such as the
1350 // verifier. This is only necessary with allocators which use LiveIntervals,
1351 // since FastRegAlloc does the replacements itself.
1352 addPass(createVirtRegRewriter(false));
1353
1354 // Equivalent of PEI for SGPRs.
1355 addPass(&SILowerSGPRSpillsID);
1356
1357 addPass(createVGPRAllocPass(true));
1358
1359 addPreRewrite();
1360 addPass(&VirtRegRewriterID);
1361
1362 return true;
1363}
1364
1365void GCNPassConfig::addPostRegAlloc() {
1366 addPass(&SIFixVGPRCopiesID);
1367 if (getOptLevel() > CodeGenOpt::None)
1368 addPass(&SIOptimizeExecMaskingID);
1369 TargetPassConfig::addPostRegAlloc();
1370}
1371
1372void GCNPassConfig::addPreSched2() {
1373 if (TM->getOptLevel() > CodeGenOpt::None)
1374 addPass(createSIShrinkInstructionsPass());
1375 addPass(&SIPostRABundlerID);
1376}
1377
1378void GCNPassConfig::addPreEmitPass() {
1379 if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
1380 addPass(&GCNCreateVOPDID);
1381 addPass(createSIMemoryLegalizerPass());
1382 addPass(createSIInsertWaitcntsPass());
1383
1384 addPass(createSIModeRegisterPass());
1385
1386 if (getOptLevel() > CodeGenOpt::None)
1387 addPass(&SIInsertHardClausesID);
1388
1389 addPass(&SILateBranchLoweringPassID);
1390 if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
1391 addPass(createAMDGPUSetWavePriorityPass());
1392 if (getOptLevel() > CodeGenOpt::None)
1393 addPass(&SIPreEmitPeepholeID);
1394 // The hazard recognizer that runs as part of the post-ra scheduler does not
1395 // guarantee to be able to handle all hazards correctly. This is because if there
1396 // are multiple scheduling regions in a basic block, the regions are scheduled
1397 // bottom up, so when we begin to schedule a region we don't know what
1398 // instructions were emitted directly before it.
1399 //
1400 // Here we add a stand-alone hazard recognizer pass which can handle all
1401 // cases.
1402 addPass(&PostRAHazardRecognizerID);
1403
1404 if (getOptLevel() > CodeGenOpt::Less)
1405 addPass(&AMDGPUReleaseVGPRsID);
1406
1407 if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
1408 addPass(&AMDGPUInsertDelayAluID);
1409
1410 addPass(&BranchRelaxationPassID);
1411}
1412
1413TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1414 return new GCNPassConfig(*this, PM);
1415}
1416
1417MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
1418 BumpPtrAllocator &Allocator, const Function &F,
1419 const TargetSubtargetInfo *STI) const {
1420 return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
1421 Allocator, F, static_cast<const GCNSubtarget *>(STI));
1422}
1423
1424yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1425 return new yaml::SIMachineFunctionInfo();
1426}
1427
1428yaml::MachineFunctionInfo *
1429GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1430 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1431 return new yaml::SIMachineFunctionInfo(
1432 *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
1433}
1434
1435bool GCNTargetMachine::parseMachineFunctionInfo(
1436 const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1437 SMDiagnostic &Error, SMRange &SourceRange) const {
1438 const yaml::SIMachineFunctionInfo &YamlMFI =
1439 static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1440 MachineFunction &MF = PFS.MF;
1441 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1442
1443 if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1444 return true;
1445
1446 if (MFI->Occupancy == 0) {
1447 // Fix up the subtarget-dependent default value.
1448 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1449 MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
1450 }
1451
1452 auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1453 Register TempReg;
1454 if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1455 SourceRange = RegName.SourceRange;
1456 return true;
1457 }
1458 RegVal = TempReg;
1459
1460 return false;
1461 };
1462
1463 auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
1464 Register &RegVal) {
1465 return !RegName.Value.empty() && parseRegister(RegName, RegVal);
1466 };
1467
1468 if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
1469 return true;
1470
1471 auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
1472 // Create a diagnostic for the register string literal.
1473 const MemoryBuffer &Buffer =
1474 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1475 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1476 RegName.Value.size(), SourceMgr::DK_Error,
1477 "incorrect register class for field", RegName.Value,
1478 std::nullopt, std::nullopt);
1479 SourceRange = RegName.SourceRange;
1480 return true;
1481 };
1482
1483 if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1484 parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1485 parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1486 return true;
1487
1488 if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1489 !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1490 return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1491 }
1492
1493 if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1494 !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1495 return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1496 }
1497
1498 if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1499 !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1500 return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1501 }
1502
1503 for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
1504 Register ParsedReg;
1505 if (parseRegister(YamlReg, ParsedReg))
1506 return true;
1507
1508 MFI->reserveWWMRegister(ParsedReg);
1509 }
1510
1511 auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
1512 const TargetRegisterClass &RC,
1513 ArgDescriptor &Arg, unsigned UserSGPRs,
1514 unsigned SystemSGPRs) {
1515 // Skip parsing if it's not present.
1516 if (!A)
1517 return false;
1518
1519 if (A->IsRegister) {
1520 Register Reg;
1521 if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1522 SourceRange = A->RegisterName.SourceRange;
1523 return true;
1524 }
1525 if (!RC.contains(Reg))
1526 return diagnoseRegisterClass(A->RegisterName);
1527 Arg = ArgDescriptor::createRegister(Reg);
1528 } else
1529 Arg = ArgDescriptor::createStack(A->StackOffset);
1530 // Check and apply the optional mask.
1531 if (A->Mask)
1532 Arg = ArgDescriptor::createArg(Arg, *A->Mask);
1533
1534 MFI->NumUserSGPRs += UserSGPRs;
1535 MFI->NumSystemSGPRs += SystemSGPRs;
1536 return false;
1537 };
1538
1539 if (YamlMFI.ArgInfo &&
1540 (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1541 AMDGPU::SGPR_128RegClass,
1542 MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1543 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1544 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1545 2, 0) ||
1546 parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1547 MFI->ArgInfo.QueuePtr, 2, 0) ||
1548 parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1549 AMDGPU::SReg_64RegClass,
1550 MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1551 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1552 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1553 2, 0) ||
1554 parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1555 AMDGPU::SReg_64RegClass,
1556 MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1557 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1558 AMDGPU::SGPR_32RegClass,
1559 MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1560 parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
1561 AMDGPU::SGPR_32RegClass,
1562 MFI->ArgInfo.LDSKernelId, 0, 1) ||
1563 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1564 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1565 0, 1) ||
1566 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1567 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1568 0, 1) ||
1569 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1570 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1571 0, 1) ||
1572 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1573 AMDGPU::SGPR_32RegClass,
1574 MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1575 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1576 AMDGPU::SGPR_32RegClass,
1577 MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1578 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1579 AMDGPU::SReg_64RegClass,
1580 MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1581 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1582 AMDGPU::SReg_64RegClass,
1583 MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1584 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1585 AMDGPU::VGPR_32RegClass,
1586 MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1587 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1588 AMDGPU::VGPR_32RegClass,
1589 MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1590 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1591 AMDGPU::VGPR_32RegClass,
1592 MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1593 return true;
1594
1595 MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1596 MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1597
1598 // FIXME: Move proper support for denormal-fp-math into base MachineFunction
1599 MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
1600 ? DenormalMode::IEEE
1601 : DenormalMode::PreserveSign;
1602 MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
1603 ? DenormalMode::IEEE
1604 : DenormalMode::PreserveSign;
1605
1606 MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
1607 ? DenormalMode::IEEE
1608 : DenormalMode::PreserveSign;
1609 MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
1610 ? DenormalMode::IEEE
1611 : DenormalMode::PreserveSign;
1612
1613 return false;
1614}