Bug Summary

File: /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Warning: line 117, column 5
Value stored to 'Ctor' is never read
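
The flagged code is in initializeDefaultSGPRRegisterAllocatorOnce() (source lines 113-120 in the listing below): inside the if (!Ctor) branch, the assignment Ctor = SGPRRegAlloc; stores a value into the local that is never read again, because SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc) is what actually records the default allocator. A minimal sketch of how the dead store could be dropped is shown here; it relies on the declarations visible in the listing and is only an illustration of the analyzer's finding, not a committed upstream fix.

  static void initializeDefaultSGPRRegisterAllocatorOnce() {
    RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

    if (!Ctor) {
      // The flagged line also wrote "Ctor = SGPRRegAlloc;" here, but that value
      // was never read afterwards; setDefault() is what registers the allocator.
      SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
    }
  }

initializeDefaultVGPRRegisterAllocatorOnce() (source lines 122-129) follows the same pattern, so the same cleanup would apply there.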

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUTargetMachine.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/tmp/scan-build-2022-10-03-140002-15933-1 -x c++ /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// The AMDGPU target machine contains all of the hardware specific
11/// information needed to emit code for SI+ GPUs.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUTargetMachine.h"
16#include "AMDGPU.h"
17#include "AMDGPUAliasAnalysis.h"
18#include "AMDGPUExportClustering.h"
19#include "AMDGPUIGroupLP.h"
20#include "AMDGPUMacroFusion.h"
21#include "AMDGPUTargetObjectFile.h"
22#include "AMDGPUTargetTransformInfo.h"
23#include "GCNIterativeScheduler.h"
24#include "GCNSchedStrategy.h"
25#include "GCNVOPDUtils.h"
26#include "R600.h"
27#include "R600TargetMachine.h"
28#include "SIMachineFunctionInfo.h"
29#include "SIMachineScheduler.h"
30#include "TargetInfo/AMDGPUTargetInfo.h"
31#include "Utils/AMDGPUBaseInfo.h"
32#include "llvm/Analysis/CGSCCPassManager.h"
33#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
34#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
35#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
36#include "llvm/CodeGen/GlobalISel/Legalizer.h"
37#include "llvm/CodeGen/GlobalISel/Localizer.h"
38#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
39#include "llvm/CodeGen/MIRParser/MIParser.h"
40#include "llvm/CodeGen/Passes.h"
41#include "llvm/CodeGen/RegAllocRegistry.h"
42#include "llvm/CodeGen/TargetPassConfig.h"
43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/LegacyPassManager.h"
45#include "llvm/IR/PassManager.h"
46#include "llvm/IR/PatternMatch.h"
47#include "llvm/InitializePasses.h"
48#include "llvm/MC/TargetRegistry.h"
49#include "llvm/Passes/PassBuilder.h"
50#include "llvm/Transforms/IPO.h"
51#include "llvm/Transforms/IPO/AlwaysInliner.h"
52#include "llvm/Transforms/IPO/GlobalDCE.h"
53#include "llvm/Transforms/IPO/Internalize.h"
54#include "llvm/Transforms/IPO/PassManagerBuilder.h"
55#include "llvm/Transforms/Scalar.h"
56#include "llvm/Transforms/Scalar/GVN.h"
57#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
58#include "llvm/Transforms/Utils.h"
59#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
60#include "llvm/Transforms/Vectorize.h"
61
62using namespace llvm;
63using namespace llvm::PatternMatch;
64
65namespace {
66class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
67public:
68 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
69 : RegisterRegAllocBase(N, D, C) {}
70};
71
72class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
73public:
74 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
75 : RegisterRegAllocBase(N, D, C) {}
76};
77
78static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
79 const TargetRegisterClass &RC) {
80 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
81}
82
83static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
84 const TargetRegisterClass &RC) {
85 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
86}
87
88
89/// -{sgpr|vgpr}-regalloc=... command line option.
90static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
91
92/// A dummy default pass factory indicates whether the register allocator is
93/// overridden on the command line.
94static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
95static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
96
97static SGPRRegisterRegAlloc
98defaultSGPRRegAlloc("default",
99 "pick SGPR register allocator based on -O option",
100 useDefaultRegisterAllocator);
101
102static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
103 RegisterPassParser<SGPRRegisterRegAlloc>>
104SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
105 cl::desc("Register allocator to use for SGPRs"));
106
107static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
108 RegisterPassParser<VGPRRegisterRegAlloc>>
109VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
110 cl::desc("Register allocator to use for VGPRs"));
111
112
113static void initializeDefaultSGPRRegisterAllocatorOnce() {
114 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
115
116 if (!Ctor) {
117 Ctor = SGPRRegAlloc;
Value stored to 'Ctor' is never read
118 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
119 }
120}
121
122static void initializeDefaultVGPRRegisterAllocatorOnce() {
123 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
124
125 if (!Ctor) {
126 Ctor = VGPRRegAlloc;
127 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
128 }
129}
130
131static FunctionPass *createBasicSGPRRegisterAllocator() {
132 return createBasicRegisterAllocator(onlyAllocateSGPRs);
133}
134
135static FunctionPass *createGreedySGPRRegisterAllocator() {
136 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
137}
138
139static FunctionPass *createFastSGPRRegisterAllocator() {
140 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
141}
142
143static FunctionPass *createBasicVGPRRegisterAllocator() {
144 return createBasicRegisterAllocator(onlyAllocateVGPRs);
145}
146
147static FunctionPass *createGreedyVGPRRegisterAllocator() {
148 return createGreedyRegisterAllocator(onlyAllocateVGPRs);
149}
150
151static FunctionPass *createFastVGPRRegisterAllocator() {
152 return createFastRegisterAllocator(onlyAllocateVGPRs, true);
153}
154
155static SGPRRegisterRegAlloc basicRegAllocSGPR(
156 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
157static SGPRRegisterRegAlloc greedyRegAllocSGPR(
158 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
159
160static SGPRRegisterRegAlloc fastRegAllocSGPR(
161 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
162
163
164static VGPRRegisterRegAlloc basicRegAllocVGPR(
165 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
166static VGPRRegisterRegAlloc greedyRegAllocVGPR(
167 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
168
169static VGPRRegisterRegAlloc fastRegAllocVGPR(
170 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
171}
172
173static cl::opt<bool> EnableSROA(
174 "amdgpu-sroa",
175 cl::desc("Run SROA after promote alloca pass"),
176 cl::ReallyHidden,
177 cl::init(true));
178
179static cl::opt<bool>
180EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
181 cl::desc("Run early if-conversion"),
182 cl::init(false));
183
184static cl::opt<bool>
185OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
186 cl::desc("Run pre-RA exec mask optimizations"),
187 cl::init(true));
188
189// Option to disable vectorizer for tests.
190static cl::opt<bool> EnableLoadStoreVectorizer(
191 "amdgpu-load-store-vectorizer",
192 cl::desc("Enable load store vectorizer"),
193 cl::init(true),
194 cl::Hidden);
195
196// Option to control global loads scalarization
197static cl::opt<bool> ScalarizeGlobal(
198 "amdgpu-scalarize-global-loads",
199 cl::desc("Enable global load scalarization"),
200 cl::init(true),
201 cl::Hidden);
202
203// Option to run internalize pass.
204static cl::opt<bool> InternalizeSymbols(
205 "amdgpu-internalize-symbols",
206 cl::desc("Enable elimination of non-kernel functions and unused globals"),
207 cl::init(false),
208 cl::Hidden);
209
210// Option to inline all early.
211static cl::opt<bool> EarlyInlineAll(
212 "amdgpu-early-inline-all",
213 cl::desc("Inline all functions early"),
214 cl::init(false),
215 cl::Hidden);
216
217static cl::opt<bool> EnableSDWAPeephole(
218 "amdgpu-sdwa-peephole",
219 cl::desc("Enable SDWA peepholer"),
220 cl::init(true));
221
222static cl::opt<bool> EnableDPPCombine(
223 "amdgpu-dpp-combine",
224 cl::desc("Enable DPP combiner"),
225 cl::init(true));
226
227// Enable address space based alias analysis
228static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
229 cl::desc("Enable AMDGPU Alias Analysis"),
230 cl::init(true));
231
232// Option to run late CFG structurizer
233static cl::opt<bool, true> LateCFGStructurize(
234 "amdgpu-late-structurize",
235 cl::desc("Enable late CFG structurization"),
236 cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
237 cl::Hidden);
238
239// Enable lib calls simplifications
240static cl::opt<bool> EnableLibCallSimplify(
241 "amdgpu-simplify-libcall",
242 cl::desc("Enable amdgpu library simplifications"),
243 cl::init(true),
244 cl::Hidden);
245
246static cl::opt<bool> EnableLowerKernelArguments(
247 "amdgpu-ir-lower-kernel-arguments",
248 cl::desc("Lower kernel argument loads in IR pass"),
249 cl::init(true),
250 cl::Hidden);
251
252static cl::opt<bool> EnableRegReassign(
253 "amdgpu-reassign-regs",
254 cl::desc("Enable register reassign optimizations on gfx10+"),
255 cl::init(true),
256 cl::Hidden);
257
258static cl::opt<bool> OptVGPRLiveRange(
259 "amdgpu-opt-vgpr-liverange",
260 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
261 cl::init(true), cl::Hidden);
262
263// Enable atomic optimization
264static cl::opt<bool> EnableAtomicOptimizations(
265 "amdgpu-atomic-optimizations",
266 cl::desc("Enable atomic optimizations"),
267 cl::init(false),
268 cl::Hidden);
269
270// Enable Mode register optimization
271static cl::opt<bool> EnableSIModeRegisterPass(
272 "amdgpu-mode-register",
273 cl::desc("Enable mode register pass"),
274 cl::init(true),
275 cl::Hidden);
276
277// Enable GFX11+ s_delay_alu insertion
278static cl::opt<bool>
279 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
280 cl::desc("Enable s_delay_alu insertion"),
281 cl::init(true), cl::Hidden);
282
283// Enable GFX11+ VOPD
284static cl::opt<bool>
285 EnableVOPD("amdgpu-enable-vopd",
286 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
287 cl::init(true), cl::Hidden);
288
289// Option is used in lit tests to prevent deadcoding of patterns inspected.
290static cl::opt<bool>
291EnableDCEInRA("amdgpu-dce-in-ra",
292 cl::init(true), cl::Hidden,
293 cl::desc("Enable machine DCE inside regalloc"));
294
295static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
296 cl::desc("Adjust wave priority"),
297 cl::init(false), cl::Hidden);
298
299static cl::opt<bool> EnableScalarIRPasses(
300 "amdgpu-scalar-ir-passes",
301 cl::desc("Enable scalar IR passes"),
302 cl::init(true),
303 cl::Hidden);
304
305static cl::opt<bool> EnableStructurizerWorkarounds(
306 "amdgpu-enable-structurizer-workarounds",
307 cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
308 cl::Hidden);
309
310static cl::opt<bool> EnableLDSReplaceWithPointer(
311 "amdgpu-enable-lds-replace-with-pointer",
312 cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
313 cl::Hidden);
314
315static cl::opt<bool, true> EnableLowerModuleLDS(
316 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
317 cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
318 cl::Hidden);
319
320static cl::opt<bool> EnablePreRAOptimizations(
321 "amdgpu-enable-pre-ra-optimizations",
322 cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
323 cl::Hidden);
324
325static cl::opt<bool> EnablePromoteKernelArguments(
326 "amdgpu-enable-promote-kernel-arguments",
327 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
328 cl::Hidden, cl::init(true));
329
330static cl::opt<bool> EnableMaxIlpSchedStrategy(
331 "amdgpu-enable-max-ilp-scheduling-strategy",
332 cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
333 cl::Hidden, cl::init(false));
334
335extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
336 // Register the target
337 RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
338 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
339
340 PassRegistry *PR = PassRegistry::getPassRegistry();
341 initializeR600ClauseMergePassPass(*PR);
342 initializeR600ControlFlowFinalizerPass(*PR);
343 initializeR600PacketizerPass(*PR);
344 initializeR600ExpandSpecialInstrsPassPass(*PR);
345 initializeR600VectorRegMergerPass(*PR);
346 initializeGlobalISel(*PR);
347 initializeAMDGPUDAGToDAGISelPass(*PR);
348 initializeGCNDPPCombinePass(*PR);
349 initializeSILowerI1CopiesPass(*PR);
350 initializeSILowerSGPRSpillsPass(*PR);
351 initializeSIFixSGPRCopiesPass(*PR);
352 initializeSIFixVGPRCopiesPass(*PR);
353 initializeSIFoldOperandsPass(*PR);
354 initializeSIPeepholeSDWAPass(*PR);
355 initializeSIShrinkInstructionsPass(*PR);
356 initializeSIOptimizeExecMaskingPreRAPass(*PR);
357 initializeSIOptimizeVGPRLiveRangePass(*PR);
358 initializeSILoadStoreOptimizerPass(*PR);
359 initializeAMDGPUCtorDtorLoweringPass(*PR);
360 initializeAMDGPUAlwaysInlinePass(*PR);
361 initializeAMDGPUAttributorPass(*PR);
362 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
363 initializeAMDGPUAnnotateUniformValuesPass(*PR);
364 initializeAMDGPUArgumentUsageInfoPass(*PR);
365 initializeAMDGPUAtomicOptimizerPass(*PR);
366 initializeAMDGPULowerKernelArgumentsPass(*PR);
367 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
368 initializeAMDGPULowerKernelAttributesPass(*PR);
369 initializeAMDGPULowerIntrinsicsPass(*PR);
370 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
371 initializeAMDGPUPostLegalizerCombinerPass(*PR);
372 initializeAMDGPUPreLegalizerCombinerPass(*PR);
373 initializeAMDGPURegBankCombinerPass(*PR);
374 initializeAMDGPUPromoteAllocaPass(*PR);
375 initializeAMDGPUPromoteAllocaToVectorPass(*PR);
376 initializeAMDGPUCodeGenPreparePass(*PR);
377 initializeAMDGPULateCodeGenPreparePass(*PR);
378 initializeAMDGPUPropagateAttributesEarlyPass(*PR);
379 initializeAMDGPUPropagateAttributesLatePass(*PR);
380 initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
381 initializeAMDGPULowerModuleLDSPass(*PR);
382 initializeAMDGPURewriteOutArgumentsPass(*PR);
383 initializeAMDGPURewriteUndefForPHIPass(*PR);
384 initializeAMDGPUUnifyMetadataPass(*PR);
385 initializeSIAnnotateControlFlowPass(*PR);
386 initializeAMDGPUReleaseVGPRsPass(*PR);
387 initializeAMDGPUInsertDelayAluPass(*PR);
388 initializeSIInsertHardClausesPass(*PR);
389 initializeSIInsertWaitcntsPass(*PR);
390 initializeSIModeRegisterPass(*PR);
391 initializeSIWholeQuadModePass(*PR);
392 initializeSILowerControlFlowPass(*PR);
393 initializeSIPreEmitPeepholePass(*PR);
394 initializeSILateBranchLoweringPass(*PR);
395 initializeSIMemoryLegalizerPass(*PR);
396 initializeSIOptimizeExecMaskingPass(*PR);
397 initializeSIPreAllocateWWMRegsPass(*PR);
398 initializeSIFormMemoryClausesPass(*PR);
399 initializeSIPostRABundlerPass(*PR);
400 initializeGCNCreateVOPDPass(*PR);
401 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
402 initializeAMDGPUAAWrapperPassPass(*PR);
403 initializeAMDGPUExternalAAWrapperPass(*PR);
404 initializeAMDGPUUseNativeCallsPass(*PR);
405 initializeAMDGPUSimplifyLibCallsPass(*PR);
406 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
407 initializeAMDGPUResourceUsageAnalysisPass(*PR);
408 initializeGCNNSAReassignPass(*PR);
409 initializeGCNPreRAOptimizationsPass(*PR);
410}
411
412static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
413 return std::make_unique<AMDGPUTargetObjectFile>();
414}
415
416static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
417 return new SIScheduleDAGMI(C);
418}
419
420static ScheduleDAGInstrs *
421createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
422 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
423 ScheduleDAGMILive *DAG =
424 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
425 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
426 if (ST.shouldClusterStores())
427 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
428 DAG->addMutation(createIGroupLPDAGMutation());
429 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
430 DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
431 return DAG;
432}
433
434static ScheduleDAGInstrs *
435createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
436 ScheduleDAGMILive *DAG =
437 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
438 DAG->addMutation(createIGroupLPDAGMutation());
439 return DAG;
440}
441
442static ScheduleDAGInstrs *
443createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
444 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
445 auto DAG = new GCNIterativeScheduler(C,
446 GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
447 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
448 if (ST.shouldClusterStores())
449 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
450 return DAG;
451}
452
453static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
454 return new GCNIterativeScheduler(C,
455 GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
456}
457
458static ScheduleDAGInstrs *
459createIterativeILPMachineScheduler(MachineSchedContext *C) {
460 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
461 auto DAG = new GCNIterativeScheduler(C,
462 GCNIterativeScheduler::SCHEDULE_ILP);
463 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
464 if (ST.shouldClusterStores())
465 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
466 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
467 return DAG;
468}
469
470static MachineSchedRegistry
471SISchedRegistry("si", "Run SI's custom scheduler",
472 createSIMachineScheduler);
473
474static MachineSchedRegistry
475GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
476 "Run GCN scheduler to maximize occupancy",
477 createGCNMaxOccupancyMachineScheduler);
478
479static MachineSchedRegistry
480 GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
481 createGCNMaxILPMachineScheduler);
482
483static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
484 "gcn-iterative-max-occupancy-experimental",
485 "Run GCN scheduler to maximize occupancy (experimental)",
486 createIterativeGCNMaxOccupancyMachineScheduler);
487
488static MachineSchedRegistry GCNMinRegSchedRegistry(
489 "gcn-iterative-minreg",
490 "Run GCN iterative scheduler for minimal register usage (experimental)",
491 createMinRegScheduler);
492
493static MachineSchedRegistry GCNILPSchedRegistry(
494 "gcn-iterative-ilp",
495 "Run GCN iterative scheduler for ILP scheduling (experimental)",
496 createIterativeILPMachineScheduler);
497
498static StringRef computeDataLayout(const Triple &TT) {
499 if (TT.getArch() == Triple::r600) {
500 // 32-bit pointers.
501 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
502 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
503 }
504
505 // 32-bit private, local, and region pointers. 64-bit global, constant and
506 // flat, non-integral buffer fat pointers.
507 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
508 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
509 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
510 "-ni:7";
511}
512
513LLVM_READNONE
514static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
515 if (!GPU.empty())
516 return GPU;
517
518 // Need to default to a target with flat support for HSA.
519 if (TT.getArch() == Triple::amdgcn)
520 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
521
522 return "r600";
523}
524
525static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
526 // The AMDGPU toolchain only supports generating shared objects, so we
527 // must always use PIC.
528 return Reloc::PIC_;
529}
530
531AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
532 StringRef CPU, StringRef FS,
533 TargetOptions Options,
534 Optional<Reloc::Model> RM,
535 Optional<CodeModel::Model> CM,
536 CodeGenOpt::Level OptLevel)
537 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
538 FS, Options, getEffectiveRelocModel(RM),
539 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
540 TLOF(createTLOF(getTargetTriple())) {
541 initAsmInfo();
542 if (TT.getArch() == Triple::amdgcn) {
543 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
544 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
545 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
546 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
547 }
548}
549
550bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
551bool AMDGPUTargetMachine::EnableFunctionCalls = false;
552bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
553
554AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
555
556StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
557 Attribute GPUAttr = F.getFnAttribute("target-cpu");
558 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
559}
560
561StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
562 Attribute FSAttr = F.getFnAttribute("target-features");
563
564 return FSAttr.isValid() ? FSAttr.getValueAsString()
565 : getTargetFeatureString();
566}
567
568/// Predicate for Internalize pass.
569static bool mustPreserveGV(const GlobalValue &GV) {
570 if (const Function *F = dyn_cast<Function>(&GV))
571 return F->isDeclaration() || F->getName().startswith("__asan_") ||
572 F->getName().startswith("__sanitizer_") ||
573 AMDGPU::isEntryFunctionCC(F->getCallingConv());
574
575 GV.removeDeadConstantUsers();
576 return !GV.use_empty();
577}
578
579void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
580 Builder.DivergentTarget = true;
581
582 bool EnableOpt = getOptLevel() > CodeGenOpt::None;
583 bool Internalize = InternalizeSymbols;
584 bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
585 bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
586 bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
587 bool PromoteKernelArguments =
588 EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;
589
590 if (EnableFunctionCalls) {
591 delete Builder.Inliner;
592 Builder.Inliner = createFunctionInliningPass();
593 }
594
595 Builder.addExtension(
596 PassManagerBuilder::EP_ModuleOptimizerEarly,
597 [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
598 legacy::PassManagerBase &PM) {
599 if (AMDGPUAA) {
600 PM.add(createAMDGPUAAWrapperPass());
601 PM.add(createAMDGPUExternalAAWrapperPass());
602 }
603 PM.add(createAMDGPUUnifyMetadataPass());
604 PM.add(createAMDGPUPrintfRuntimeBinding());
605 if (Internalize)
606 PM.add(createInternalizePass(mustPreserveGV));
607 PM.add(createAMDGPUPropagateAttributesLatePass(this));
608 if (Internalize)
609 PM.add(createGlobalDCEPass());
610 if (EarlyInline)
611 PM.add(createAMDGPUAlwaysInlinePass(false));
612 });
613
614 Builder.addExtension(
615 PassManagerBuilder::EP_EarlyAsPossible,
616 [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
617 legacy::PassManagerBase &PM) {
618 if (AMDGPUAA) {
619 PM.add(createAMDGPUAAWrapperPass());
620 PM.add(createAMDGPUExternalAAWrapperPass());
621 }
622 PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
623 PM.add(llvm::createAMDGPUUseNativeCallsPass());
624 if (LibCallSimplify)
625 PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
626 });
627
628 Builder.addExtension(
629 PassManagerBuilder::EP_CGSCCOptimizerLate,
630 [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
631 legacy::PassManagerBase &PM) {
632 // Add promote kernel arguments pass to the opt pipeline right before
633 // infer address spaces which is needed to do actual address space
634 // rewriting.
635 if (PromoteKernelArguments)
636 PM.add(createAMDGPUPromoteKernelArgumentsPass());
637
638 // Add infer address spaces pass to the opt pipeline after inlining
639 // but before SROA to increase SROA opportunities.
640 PM.add(createInferAddressSpacesPass());
641
642 // This should run after inlining to have any chance of doing anything,
643 // and before other cleanup optimizations.
644 PM.add(createAMDGPULowerKernelAttributesPass());
645
646 // Promote alloca to vector before SROA and loop unroll. If we manage
647 // to eliminate allocas before unroll we may choose to unroll less.
648 if (EnableOpt)
649 PM.add(createAMDGPUPromoteAllocaToVector());
650 });
651}
652
653void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
654 AAM.registerFunctionAnalysis<AMDGPUAA>();
655}
656
657void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
658 PB.registerPipelineParsingCallback(
659 [this](StringRef PassName, ModulePassManager &PM,
660 ArrayRef<PassBuilder::PipelineElement>) {
661 if (PassName == "amdgpu-propagate-attributes-late") {
662 PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
663 return true;
664 }
665 if (PassName == "amdgpu-unify-metadata") {
666 PM.addPass(AMDGPUUnifyMetadataPass());
667 return true;
668 }
669 if (PassName == "amdgpu-printf-runtime-binding") {
670 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
671 return true;
672 }
673 if (PassName == "amdgpu-always-inline") {
674 PM.addPass(AMDGPUAlwaysInlinePass());
675 return true;
676 }
677 if (PassName == "amdgpu-replace-lds-use-with-pointer") {
678 PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
679 return true;
680 }
681 if (PassName == "amdgpu-lower-module-lds") {
682 PM.addPass(AMDGPULowerModuleLDSPass());
683 return true;
684 }
685 return false;
686 });
687 PB.registerPipelineParsingCallback(
688 [this](StringRef PassName, FunctionPassManager &PM,
689 ArrayRef<PassBuilder::PipelineElement>) {
690 if (PassName == "amdgpu-simplifylib") {
691 PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
692 return true;
693 }
694 if (PassName == "amdgpu-usenative") {
695 PM.addPass(AMDGPUUseNativeCallsPass());
696 return true;
697 }
698 if (PassName == "amdgpu-promote-alloca") {
699 PM.addPass(AMDGPUPromoteAllocaPass(*this));
700 return true;
701 }
702 if (PassName == "amdgpu-promote-alloca-to-vector") {
703 PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
704 return true;
705 }
706 if (PassName == "amdgpu-lower-kernel-attributes") {
707 PM.addPass(AMDGPULowerKernelAttributesPass());
708 return true;
709 }
710 if (PassName == "amdgpu-propagate-attributes-early") {
711 PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
712 return true;
713 }
714 if (PassName == "amdgpu-promote-kernel-arguments") {
715 PM.addPass(AMDGPUPromoteKernelArgumentsPass());
716 return true;
717 }
718 return false;
719 });
720
721 PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
722 FAM.registerPass([&] { return AMDGPUAA(); });
723 });
724
725 PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
726 if (AAName == "amdgpu-aa") {
727 AAM.registerFunctionAnalysis<AMDGPUAA>();
728 return true;
729 }
730 return false;
731 });
732
733 PB.registerPipelineStartEPCallback(
734 [this](ModulePassManager &PM, OptimizationLevel Level) {
735 FunctionPassManager FPM;
736 FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
737 FPM.addPass(AMDGPUUseNativeCallsPass());
738 if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
739 FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
740 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
741 });
742
743 PB.registerPipelineEarlySimplificationEPCallback(
744 [this](ModulePassManager &PM, OptimizationLevel Level) {
745 if (Level == OptimizationLevel::O0)
746 return;
747
748 PM.addPass(AMDGPUUnifyMetadataPass());
749 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
750
751 if (InternalizeSymbols) {
752 PM.addPass(InternalizePass(mustPreserveGV));
753 }
754 PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
755 if (InternalizeSymbols) {
756 PM.addPass(GlobalDCEPass());
757 }
758 if (EarlyInlineAll && !EnableFunctionCalls)
759 PM.addPass(AMDGPUAlwaysInlinePass());
760 });
761
762 PB.registerCGSCCOptimizerLateEPCallback(
763 [this](CGSCCPassManager &PM, OptimizationLevel Level) {
764 if (Level == OptimizationLevel::O0)
765 return;
766
767 FunctionPassManager FPM;
768
769 // Add promote kernel arguments pass to the opt pipeline right before
770 // infer address spaces which is needed to do actual address space
771 // rewriting.
772 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
773 EnablePromoteKernelArguments)
774 FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
775
776 // Add infer address spaces pass to the opt pipeline after inlining
777 // but before SROA to increase SROA opportunities.
778 FPM.addPass(InferAddressSpacesPass());
779
780 // This should run after inlining to have any chance of doing
781 // anything, and before other cleanup optimizations.
782 FPM.addPass(AMDGPULowerKernelAttributesPass());
783
784 if (Level != OptimizationLevel::O0) {
785 // Promote alloca to vector before SROA and loop unroll. If we
786 // manage to eliminate allocas before unroll we may choose to unroll
787 // less.
788 FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
789 }
790
791 PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
792 });
793}
794
795int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
796 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
797 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
798 AddrSpace == AMDGPUAS::REGION_ADDRESS)
799 ? -1
800 : 0;
801}
802
803bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
804 unsigned DestAS) const {
805 return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
806 AMDGPU::isFlatGlobalAddrSpace(DestAS);
807}
808
809unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
810 const auto *LD = dyn_cast<LoadInst>(V);
811 if (!LD)
812 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
813
814 // It must be a generic pointer loaded.
815 assert(V->getType()->isPointerTy() &&
816 V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
817
818 const auto *Ptr = LD->getPointerOperand();
819 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
820 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
821 // For a generic pointer loaded from the constant memory, it could be assumed
822 // as a global pointer since the constant memory is only populated on the
823 // host side. As implied by the offload programming model, only global
824 // pointers could be referenced on the host side.
825 return AMDGPUAS::GLOBAL_ADDRESS;
826}
827
828std::pair<const Value *, unsigned>
829AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
830 if (auto *II = dyn_cast<IntrinsicInst>(V)) {
831 switch (II->getIntrinsicID()) {
832 case Intrinsic::amdgcn_is_shared:
833 return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
834 case Intrinsic::amdgcn_is_private:
835 return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
836 default:
837 break;
838 }
839 return std::make_pair(nullptr, -1);
840 }
841 // Check the global pointer predication based on
842 // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
843 // the order of 'is_shared' and 'is_private' is not significant.
844 Value *Ptr;
845 if (match(
846 const_cast<Value *>(V),
847 m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
848 m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
849 m_Deferred(Ptr))))))
850 return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
851
852 return std::make_pair(nullptr, -1);
853}
854
855unsigned
856AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
857 switch (Kind) {
858 case PseudoSourceValue::Stack:
859 case PseudoSourceValue::FixedStack:
860 return AMDGPUAS::PRIVATE_ADDRESS;
861 case PseudoSourceValue::ConstantPool:
862 case PseudoSourceValue::GOT:
863 case PseudoSourceValue::JumpTable:
864 case PseudoSourceValue::GlobalValueCallEntry:
865 case PseudoSourceValue::ExternalSymbolCallEntry:
866 case PseudoSourceValue::TargetCustom:
867 return AMDGPUAS::CONSTANT_ADDRESS;
868 }
869 return AMDGPUAS::FLAT_ADDRESS;
870}
871
872//===----------------------------------------------------------------------===//
873// GCN Target Machine (SI+)
874//===----------------------------------------------------------------------===//
875
876GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
877 StringRef CPU, StringRef FS,
878 TargetOptions Options,
879 Optional<Reloc::Model> RM,
880 Optional<CodeModel::Model> CM,
881 CodeGenOpt::Level OL, bool JIT)
882 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
883
884const TargetSubtargetInfo *
885GCNTargetMachine::getSubtargetImpl(const Function &F) const {
886 StringRef GPU = getGPUName(F);
887 StringRef FS = getFeatureString(F);
888
889 SmallString<128> SubtargetKey(GPU);
890 SubtargetKey.append(FS);
891
892 auto &I = SubtargetMap[SubtargetKey];
893 if (!I) {
894 // This needs to be done before we create a new subtarget since any
895 // creation will depend on the TM and the code generation flags on the
896 // function that reside in TargetOptions.
897 resetTargetOptions(F);
898 I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
899 }
900
901 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
902
903 return I.get();
904}
905
906TargetTransformInfo
907GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
908 return TargetTransformInfo(GCNTTIImpl(this, F));
909}
910
911//===----------------------------------------------------------------------===//
912// AMDGPU Pass Setup
913//===----------------------------------------------------------------------===//
914
915std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
916 return getStandardCSEConfigForOpt(TM->getOptLevel());
917}
918
919namespace {
920
921class GCNPassConfig final : public AMDGPUPassConfig {
922public:
923 GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
924 : AMDGPUPassConfig(TM, PM) {
925 // It is necessary to know the register usage of the entire call graph. We
926 // allow calls without EnableAMDGPUFunctionCalls if they are marked
927 // noinline, so this is always required.
928 setRequiresCodeGenSCCOrder(true);
929 substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
930 }
931
932 GCNTargetMachine &getGCNTargetMachine() const {
933 return getTM<GCNTargetMachine>();
934 }
935
936 ScheduleDAGInstrs *
937 createMachineScheduler(MachineSchedContext *C) const override;
938
939 ScheduleDAGInstrs *
940 createPostMachineScheduler(MachineSchedContext *C) const override {
941 ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
942 C, std::make_unique<PostGenericScheduler>(C),
943 /*RemoveKillFlags=*/true);
944 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
945 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
946 if (ST.shouldClusterStores())
947 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
948 DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
949 DAG->addMutation(createIGroupLPDAGMutation());
950 if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
951 DAG->addMutation(createVOPDPairingMutation());
952 return DAG;
953 }
954
955 bool addPreISel() override;
956 void addMachineSSAOptimization() override;
957 bool addILPOpts() override;
958 bool addInstSelector() override;
959 bool addIRTranslator() override;
960 void addPreLegalizeMachineIR() override;
961 bool addLegalizeMachineIR() override;
962 void addPreRegBankSelect() override;
963 bool addRegBankSelect() override;
964 void addPreGlobalInstructionSelect() override;
965 bool addGlobalInstructionSelect() override;
966 void addFastRegAlloc() override;
967 void addOptimizedRegAlloc() override;
968
969 FunctionPass *createSGPRAllocPass(bool Optimized);
970 FunctionPass *createVGPRAllocPass(bool Optimized);
971 FunctionPass *createRegAllocPass(bool Optimized) override;
972
973 bool addRegAssignAndRewriteFast() override;
974 bool addRegAssignAndRewriteOptimized() override;
975
976 void addPreRegAlloc() override;
977 bool addPreRewrite() override;
978 void addPostRegAlloc() override;
979 void addPreSched2() override;
980 void addPreEmitPass() override;
981};
982
983} // end anonymous namespace
984
985AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
986 : TargetPassConfig(TM, PM) {
987 // Exceptions and StackMaps are not supported, so these passes will never do
988 // anything.
989 disablePass(&StackMapLivenessID);
990 disablePass(&FuncletLayoutID);
991 // Garbage collection is not supported.
992 disablePass(&GCLoweringID);
993 disablePass(&ShadowStackGCLoweringID);
994}
995
996void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
997 if (getOptLevel() == CodeGenOpt::Aggressive)
998 addPass(createGVNPass());
999 else
1000 addPass(createEarlyCSEPass());
1001}
1002
1003void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
1004 addPass(createLICMPass());
1005 addPass(createSeparateConstOffsetFromGEPPass());
1006 // ReassociateGEPs exposes more opportunities for SLSR. See
1007 // the example in reassociate-geps-and-slsr.ll.
1008 addPass(createStraightLineStrengthReducePass());
1009 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
1010 // EarlyCSE can reuse.
1011 addEarlyCSEOrGVNPass();
1012 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1013 addPass(createNaryReassociatePass());
1014 // NaryReassociate on GEPs creates redundant common expressions, so run
1015 // EarlyCSE after it.
1016 addPass(createEarlyCSEPass());
1017}
1018
1019void AMDGPUPassConfig::addIRPasses() {
1020 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1021
1022 // There is no reason to run these.
1023 disablePass(&StackMapLivenessID);
1024 disablePass(&FuncletLayoutID);
1025 disablePass(&PatchableFunctionID);
1026
1027 addPass(createAMDGPUPrintfRuntimeBinding());
1028 addPass(createAMDGPUCtorDtorLoweringPass());
1029
1030 // A call to propagate attributes pass in the backend in case opt was not run.
1031 addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
1032
1033 addPass(createAMDGPULowerIntrinsicsPass());
1034
1035 // Function calls are not supported, so make sure we inline everything.
1036 addPass(createAMDGPUAlwaysInlinePass());
1037 addPass(createAlwaysInlinerLegacyPass());
1038 // We need to add the barrier noop pass, otherwise adding the function
1039 // inlining pass will cause all of the PassConfigs passes to be run
1040 // one function at a time, which means if we have a module with two
1041 // functions, then we will generate code for the first function
1042 // without ever running any passes on the second.
1043 addPass(createBarrierNoopPass());
1044
1045 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1046 if (TM.getTargetTriple().getArch() == Triple::r600)
1047 addPass(createR600OpenCLImageTypeLoweringPass());
1048
1049 // Replace OpenCL enqueued block function pointers with global variables.
1050 addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
1051
1052 // Can increase LDS used by kernel so runs before PromoteAlloca
1053 if (EnableLowerModuleLDS) {
1054 // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the
1055 // pass "amdgpu-lower-module-lds", and also it required to be run only if
1056 // "amdgpu-lower-module-lds" pass is enabled.
1057 if (EnableLDSReplaceWithPointer)
1058 addPass(createAMDGPUReplaceLDSUseWithPointerPass());
1059
1060 addPass(createAMDGPULowerModuleLDSPass());
1061 }
1062
1063 if (TM.getOptLevel() > CodeGenOpt::None)
1064 addPass(createInferAddressSpacesPass());
1065
1066 addPass(createAtomicExpandPass());
1067
1068 if (TM.getOptLevel() > CodeGenOpt::None) {
1069 addPass(createAMDGPUPromoteAlloca());
1070
1071 if (EnableSROA)
1072 addPass(createSROAPass());
1073 if (isPassEnabled(EnableScalarIRPasses))
1074 addStraightLineScalarOptimizationPasses();
1075
1076 if (EnableAMDGPUAliasAnalysis) {
1077 addPass(createAMDGPUAAWrapperPass());
1078 addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1079 AAResults &AAR) {
1080 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1081 AAR.addAAResult(WrapperPass->getResult());
1082 }));
1083 }
1084
1085 if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1086 // TODO: May want to move later or split into an early and late one.
1087 addPass(createAMDGPUCodeGenPreparePass());
1088 }
1089 }
1090
1091 TargetPassConfig::addIRPasses();
1092
1093 // EarlyCSE is not always strong enough to clean up what LSR produces. For
1094 // example, GVN can combine
1095 //
1096 // %0 = add %a, %b
1097 // %1 = add %b, %a
1098 //
1099 // and
1100 //
1101 // %0 = shl nsw %a, 2
1102 // %1 = shl %a, 2
1103 //
1104 // but EarlyCSE can do neither of them.
1105 if (isPassEnabled(EnableScalarIRPasses))
1106 addEarlyCSEOrGVNPass();
1107}
1108
1109void AMDGPUPassConfig::addCodeGenPrepare() {
1110 if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1111 addPass(createAMDGPUAttributorPass());
1112
1113 // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1114 // analysis, and should be removed.
1115 addPass(createAMDGPUAnnotateKernelFeaturesPass());
1116 }
1117
1118 if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1119 EnableLowerKernelArguments)
1120 addPass(createAMDGPULowerKernelArgumentsPass());
1121
1122 TargetPassConfig::addCodeGenPrepare();
1123
1124 if (isPassEnabled(EnableLoadStoreVectorizer))
1125 addPass(createLoadStoreVectorizerPass());
1126
1127 // LowerSwitch pass may introduce unreachable blocks that can
1128 // cause unexpected behavior for subsequent passes. Placing it
1129 // here seems better that these blocks would get cleaned up by
1130 // UnreachableBlockElim inserted next in the pass flow.
1131 addPass(createLowerSwitchPass());
1132}
1133
1134bool AMDGPUPassConfig::addPreISel() {
1135 if (TM->getOptLevel() > CodeGenOpt::None)
1136 addPass(createFlattenCFGPass());
1137 return false;
1138}
1139
1140bool AMDGPUPassConfig::addInstSelector() {
1141 addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
1142 return false;
1143}
1144
1145bool AMDGPUPassConfig::addGCPasses() {
1146 // Do nothing. GC is not supported.
1147 return false;
1148}
1149
1150llvm::ScheduleDAGInstrs *
1151AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1152 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1153 ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1154 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1155 if (ST.shouldClusterStores())
1156 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1157 return DAG;
1158}
1159
1160//===----------------------------------------------------------------------===//
1161// GCN Pass Setup
1162//===----------------------------------------------------------------------===//
1163
1164ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1165 MachineSchedContext *C) const {
1166 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1167 if (ST.enableSIScheduler())
1168 return createSIMachineScheduler(C);
1169
1170 if (EnableMaxIlpSchedStrategy)
1171 return createGCNMaxILPMachineScheduler(C);
1172
1173 return createGCNMaxOccupancyMachineScheduler(C);
1174}
1175
1176bool GCNPassConfig::addPreISel() {
1177 AMDGPUPassConfig::addPreISel();
1178
1179 if (TM->getOptLevel() > CodeGenOpt::None)
1180 addPass(createAMDGPULateCodeGenPreparePass());
1181
1182 if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
1183 addPass(createAMDGPUAtomicOptimizerPass());
1184 }
1185
1186 if (TM->getOptLevel() > CodeGenOpt::None)
1187 addPass(createSinkingPass());
1188
1189 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1190 // regions formed by them.
1191 addPass(&AMDGPUUnifyDivergentExitNodesID);
1192 if (!LateCFGStructurize) {
1193 if (EnableStructurizerWorkarounds) {
1194 addPass(createFixIrreduciblePass());
1195 addPass(createUnifyLoopExitsPass());
1196 }
1197 addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1198 }
1199 addPass(createAMDGPUAnnotateUniformValues());
1200 if (!LateCFGStructurize) {
1201 addPass(createSIAnnotateControlFlowPass());
1202 // TODO: Move this right after structurizeCFG to avoid extra divergence
1203 // analysis. This depends on stopping SIAnnotateControlFlow from making
1204 // control flow modifications.
1205 addPass(createAMDGPURewriteUndefForPHIPass());
1206 }
1207 addPass(createLCSSAPass());
1208
1209 if (TM->getOptLevel() > CodeGenOpt::Less)
1210 addPass(&AMDGPUPerfHintAnalysisID);
1211
1212 return false;
1213}
1214
1215void GCNPassConfig::addMachineSSAOptimization() {
1216 TargetPassConfig::addMachineSSAOptimization();
1217
1218 // We want to fold operands after PeepholeOptimizer has run (or as part of
1219 // it), because it will eliminate extra copies making it easier to fold the
1220 // real source operand. We want to eliminate dead instructions after, so that
1221 // we see fewer uses of the copies. We then need to clean up the dead
1222 // instructions leftover after the operands are folded as well.
1223 //
1224 // XXX - Can we get away without running DeadMachineInstructionElim again?
1225 addPass(&SIFoldOperandsID);
1226 if (EnableDPPCombine)
1227 addPass(&GCNDPPCombineID);
1228 addPass(&SILoadStoreOptimizerID);
1229 if (isPassEnabled(EnableSDWAPeephole)) {
1230 addPass(&SIPeepholeSDWAID);
1231 addPass(&EarlyMachineLICMID);
1232 addPass(&MachineCSEID);
1233 addPass(&SIFoldOperandsID);
1234 }
1235 addPass(&DeadMachineInstructionElimID);
1236 addPass(createSIShrinkInstructionsPass());
1237}
1238
1239bool GCNPassConfig::addILPOpts() {
1240 if (EnableEarlyIfConversion)
1241 addPass(&EarlyIfConverterID);
1242
1243 TargetPassConfig::addILPOpts();
1244 return false;
1245}
1246
1247bool GCNPassConfig::addInstSelector() {
1248 AMDGPUPassConfig::addInstSelector();
1249 addPass(&SIFixSGPRCopiesID);
1250 addPass(createSILowerI1CopiesPass());
1251 return false;
1252}
1253
1254bool GCNPassConfig::addIRTranslator() {
1255 addPass(new IRTranslator(getOptLevel()));
1256 return false;
1257}
1258
1259void GCNPassConfig::addPreLegalizeMachineIR() {
1260 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1261 addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1262 addPass(new Localizer());
1263}
1264
1265bool GCNPassConfig::addLegalizeMachineIR() {
1266 addPass(new Legalizer());
1267 return false;
1268}
1269
1270void GCNPassConfig::addPreRegBankSelect() {
1271 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1272 addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1273}
1274
1275bool GCNPassConfig::addRegBankSelect() {
1276 addPass(new RegBankSelect());
1277 return false;
1278}
1279
1280void GCNPassConfig::addPreGlobalInstructionSelect() {
1281 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1282 addPass(createAMDGPURegBankCombiner(IsOptNone));
1283}
1284
1285bool GCNPassConfig::addGlobalInstructionSelect() {
1286 addPass(new InstructionSelect(getOptLevel()));
1287 return false;
1288}
1289
1290void GCNPassConfig::addPreRegAlloc() {
1291 if (LateCFGStructurize) {
1292 addPass(createAMDGPUMachineCFGStructurizerPass());
1293 }
1294}
1295
1296void GCNPassConfig::addFastRegAlloc() {
1297 // FIXME: We have to disable the verifier here because of PHIElimination +
1298 // TwoAddressInstructions disabling it.
1299
1300 // This must be run immediately after phi elimination and before
1301 // TwoAddressInstructions, otherwise the processing of the tied operand of
1302 // SI_ELSE will introduce a copy of the tied operand source after the else.
1303 insertPass(&PHIEliminationID, &SILowerControlFlowID);
1304
1305 insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1306 insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
1307
1308 TargetPassConfig::addFastRegAlloc();
1309}
1310
1311void GCNPassConfig::addOptimizedRegAlloc() {
1312 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1313 // instructions that cause scheduling barriers.
1314 insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1315 insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
1316
1317 if (OptExecMaskPreRA)
1318 insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1319
1320 if (isPassEnabled(EnablePreRAOptimizations))
1321 insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1322
1323 // This is not an essential optimization and it has a noticeable impact on
1324 // compilation time, so we only enable it from O2.
1325 if (TM->getOptLevel() > CodeGenOpt::Less)
1326 insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1327
1328 // FIXME: when an instruction has a Killed operand, and the instruction is
1329 // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
1330 // the register in LiveVariables, this would trigger a failure in verifier,
1331 // we should fix it and enable the verifier.
1332 if (OptVGPRLiveRange)
1333 insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
1334 // This must be run immediately after phi elimination and before
1335 // TwoAddressInstructions, otherwise the processing of the tied operand of
1336 // SI_ELSE will introduce a copy of the tied operand source after the else.
1337 insertPass(&PHIEliminationID, &SILowerControlFlowID);
1338
1339 if (EnableDCEInRA)
1340 insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1341
1342 TargetPassConfig::addOptimizedRegAlloc();
1343}
1344
1345bool GCNPassConfig::addPreRewrite() {
1346 if (EnableRegReassign)
1347 addPass(&GCNNSAReassignID);
1348 return true;
1349}
1350
1351FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1352 // Initialize the global default.
1353 llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1354 initializeDefaultSGPRRegisterAllocatorOnce);
1355
1356 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1357 if (Ctor != useDefaultRegisterAllocator)
1358 return Ctor();
1359
1360 if (Optimized)
1361 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1362
1363 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1364}
1365
1366FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1367 // Initialize the global default.
1368 llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1369 initializeDefaultVGPRRegisterAllocatorOnce);
1370
1371 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1372 if (Ctor != useDefaultRegisterAllocator)
1373 return Ctor();
1374
1375 if (Optimized)
1376 return createGreedyVGPRRegisterAllocator();
1377
1378 return createFastVGPRRegisterAllocator();
1379}
1380
1381FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1382 llvm_unreachable("should not be used");
1383}
1384
1385static const char RegAllocOptNotSupportedMessage[] =
1386 "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1387
1388bool GCNPassConfig::addRegAssignAndRewriteFast() {
1389 if (!usingDefaultRegAlloc())
1390 report_fatal_error(RegAllocOptNotSupportedMessage);
1391
1392 addPass(createSGPRAllocPass(false));
1393
1394 // Equivalent of PEI for SGPRs.
1395 addPass(&SILowerSGPRSpillsID);
1396
1397 addPass(createVGPRAllocPass(false));
1398 return true;
1399}
1400
1401bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1402 if (!usingDefaultRegAlloc())
1403 report_fatal_error(RegAllocOptNotSupportedMessage);
1404
1405 addPass(createSGPRAllocPass(true));
1406
1407 // Commit allocated register changes. This is mostly necessary because too
1408 // many things rely on the use lists of the physical registers, such as the
1409 // verifier. This is only necessary with allocators which use LiveIntervals,
1410 // since FastRegAlloc does the replacements itself.
1411 addPass(createVirtRegRewriter(false));
1412
1413 // Equivalent of PEI for SGPRs.
1414 addPass(&SILowerSGPRSpillsID);
1415
1416 addPass(createVGPRAllocPass(true));
1417
1418 addPreRewrite();
1419 addPass(&VirtRegRewriterID);
1420
1421 return true;
1422}
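// Summarizing the optimized path just built, in order:
//
//   createSGPRAllocPass(true)      greedy allocation restricted to SGPRs
//   createVirtRegRewriter(false)   commit the SGPR assignments (see above)
//   SILowerSGPRSpillsID            the SGPR equivalent of PEI
//   createVGPRAllocPass(true)      greedy allocation of the remaining VGPRs
//   addPreRewrite()                GCNNSAReassign when EnableRegReassign is set
//   VirtRegRewriterID              final virtual-register rewrite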
1423
1424void GCNPassConfig::addPostRegAlloc() {
1425 addPass(&SIFixVGPRCopiesID);
1426 if (getOptLevel() > CodeGenOpt::None)
1427 addPass(&SIOptimizeExecMaskingID);
1428 TargetPassConfig::addPostRegAlloc();
1429}
1430
1431void GCNPassConfig::addPreSched2() {
1432 if (TM->getOptLevel() > CodeGenOpt::None)
1433 addPass(createSIShrinkInstructionsPass());
1434 addPass(&SIPostRABundlerID);
1435}
1436
1437void GCNPassConfig::addPreEmitPass() {
1438 if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
1439 addPass(&GCNCreateVOPDID);
1440 addPass(createSIMemoryLegalizerPass());
1441 addPass(createSIInsertWaitcntsPass());
1442
1443 addPass(createSIModeRegisterPass());
1444
1445 if (getOptLevel() > CodeGenOpt::None)
1446 addPass(&SIInsertHardClausesID);
1447
1448 addPass(&SILateBranchLoweringPassID);
1449 if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
1450 addPass(createAMDGPUSetWavePriorityPass());
1451 if (getOptLevel() > CodeGenOpt::None)
1452 addPass(&SIPreEmitPeepholeID);
1453 // The hazard recognizer that runs as part of the post-ra scheduler is not
1454 // guaranteed to be able to handle all hazards correctly. This is because, if
1455 // there are multiple scheduling regions in a basic block, the regions are
1456 // scheduled bottom up, so when we begin to schedule a region we don't know
1457 // what instructions were emitted directly before it.
1458 //
1459 // Here we add a stand-alone hazard recognizer pass which can handle all
1460 // cases.
1461 addPass(&PostRAHazardRecognizerID);
1462
1463 if (getOptLevel() > CodeGenOpt::Less)
1464 addPass(&AMDGPUReleaseVGPRsID);
1465
1466 if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
1467 addPass(&AMDGPUInsertDelayAluID);
1468
1469 addPass(&BranchRelaxationPassID);
1470}
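// The ordering above is deliberate: the memory legalizer and waitcnt insertion
// run before hard-clause formation and late branch lowering, the stand-alone
// hazard recognizer then handles the cross-region hazards the post-RA
// scheduler cannot see (per the comment above), and branch relaxation runs
// last, once the size and placement of every block is final.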
1471
1472TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1473 return new GCNPassConfig(*this, PM);
1474}
1475
1476yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1477 return new yaml::SIMachineFunctionInfo();
1478}
1479
1480yaml::MachineFunctionInfo *
1481GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1482 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1483 return new yaml::SIMachineFunctionInfo(
1484 *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
1485}
1486
1487bool GCNTargetMachine::parseMachineFunctionInfo(
1488 const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1489 SMDiagnostic &Error, SMRange &SourceRange) const {
1490 const yaml::SIMachineFunctionInfo &YamlMFI =
1491 static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1492 MachineFunction &MF = PFS.MF;
1493 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1494
1495 if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1496 return true;
1497
1498 if (MFI->Occupancy == 0) {
1499 // Fix up the subtarget-dependent default value.
1500 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1501 MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
1502 }
1503
1504 auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1505 Register TempReg;
1506 if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1507 SourceRange = RegName.SourceRange;
1508 return true;
1509 }
1510 RegVal = TempReg;
1511
1512 return false;
1513 };
1514
1515 auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
1516 Register &RegVal) {
1517 return !RegName.Value.empty() && parseRegister(RegName, RegVal);
1518 };
1519
1520 if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
1521 return true;
1522
1523 auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
1524 // Create a diagnostic for the register string literal.
1525 const MemoryBuffer &Buffer =
1526 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1527 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1528 RegName.Value.size(), SourceMgr::DK_Error,
1529 "incorrect register class for field", RegName.Value,
1530 None, None);
1531 SourceRange = RegName.SourceRange;
1532 return true;
1533 };
1534
1535 if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1536 parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1537 parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1538 return true;
1539
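// The three checks below treat the target's sentinel values (PRIVATE_RSRC_REG,
// FP_REG, SP_REG) as "not assigned"; any other value must lie in the expected
// register class (SGPR_128 for the scratch resource descriptor, SGPR_32 for
// the frame and stack pointer registers), otherwise diagnoseRegisterClass
// reports an error against the offending YAML field.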
1540 if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1541 !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1542 return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1543 }
1544
1545 if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1546 !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1547 return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1548 }
1549
1550 if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1551 !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1552 return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1553 }
1554
1555 for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
1556 Register ParsedReg;
1557 if (parseRegister(YamlReg, ParsedReg))
1558 return true;
1559
1560 MFI->reserveWWMRegister(ParsedReg);
1561 }
1562
1563 auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
1564 const TargetRegisterClass &RC,
1565 ArgDescriptor &Arg, unsigned UserSGPRs,
1566 unsigned SystemSGPRs) {
1567 // Skip parsing if it's not present.
1568 if (!A)
1569 return false;
1570
1571 if (A->IsRegister) {
1572 Register Reg;
1573 if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1574 SourceRange = A->RegisterName.SourceRange;
1575 return true;
1576 }
1577 if (!RC.contains(Reg))
1578 return diagnoseRegisterClass(A->RegisterName);
1579 Arg = ArgDescriptor::createRegister(Reg);
1580 } else
1581 Arg = ArgDescriptor::createStack(A->StackOffset);
1582 // Check and apply the optional mask.
1583 if (A->Mask)
1584 Arg = ArgDescriptor::createArg(Arg, *A->Mask);
1585
1586 MFI->NumUserSGPRs += UserSGPRs;
1587 MFI->NumSystemSGPRs += SystemSGPRs;
1588 return false;
1589 };
1590
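// The single condition below runs every optional ABI argument in
// YamlMFI.ArgInfo through parseAndCheckArgument, pairing each field with its
// expected register class and its user/system SGPR cost; the || chain
// short-circuits, so parsing fails at the first invalid entry.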
1591 if (YamlMFI.ArgInfo &&
1592 (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1593 AMDGPU::SGPR_128RegClass,
1594 MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1595 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1596 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1597 2, 0) ||
1598 parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1599 MFI->ArgInfo.QueuePtr, 2, 0) ||
1600 parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1601 AMDGPU::SReg_64RegClass,
1602 MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1603 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1604 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1605 2, 0) ||
1606 parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1607 AMDGPU::SReg_64RegClass,
1608 MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1609 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1610 AMDGPU::SGPR_32RegClass,
1611 MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1612 parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
1613 AMDGPU::SGPR_32RegClass,
1614 MFI->ArgInfo.LDSKernelId, 0, 1) ||
1615 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1616 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1617 0, 1) ||
1618 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1619 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1620 0, 1) ||
1621 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1622 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1623 0, 1) ||
1624 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1625 AMDGPU::SGPR_32RegClass,
1626 MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1627 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1628 AMDGPU::SGPR_32RegClass,
1629 MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1630 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1631 AMDGPU::SReg_64RegClass,
1632 MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1633 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1634 AMDGPU::SReg_64RegClass,
1635 MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1636 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1637 AMDGPU::VGPR_32RegClass,
1638 MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1639 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1640 AMDGPU::VGPR_32RegClass,
1641 MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1642 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1643 AMDGPU::VGPR_32RegClass,
1644 MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1645 return true;
1646
1647 MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1648 MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1649 MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
1650 MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
1651 MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
1652 MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;
1653
1654 return false;
1655}
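// For context, parseMachineFunctionInfo is driven by the machineFunctionInfo
// block of an MIR file. A minimal illustrative fragment is sketched below; the
// exact key spellings come from the yaml::SIMachineFunctionInfo mapping and
// should be checked there rather than taken from this sketch:
//
//   machineFunctionInfo:
//     scratchRSrcReg:    '$sgpr96_sgpr97_sgpr98_sgpr99'  # checked against SGPR_128
//     frameOffsetReg:    '$sgpr33'                       # checked against SGPR_32
//     stackPtrOffsetReg: '$sgpr32'                       # checked against SGPR_32
//     argumentInfo:
//       kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }       # checked against SReg_64
//       workItemIDX:       { reg: '$vgpr0' }             # checked against VGPR_32
//
// Omitted fields are simply skipped by parseAndCheckArgument, and an occupancy
// of 0 is recomputed from the subtarget and LDS size as shown above.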