Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Warning: line 110, column 5
Value stored to 'Ctor' is never read
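
The flagged function is initializeDefaultSGPRRegisterAllocatorOnce() (source lines 106-113 in the listing below): the local 'Ctor' is reassigned inside the 'if', but the function returns immediately afterwards, so that store is never read and only the SGPRRegisterRegAlloc::setDefault() call has an observable effect. The sketch below is one possible cleanup using only names already present in this file; it is an illustration of how the dead store could be removed, not the upstream fix. The same pattern appears in the VGPR variant at lines 115-122.

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  // The original code also wrote 'Ctor = SGPRRegAlloc;' inside the 'if', but
  // that value is never read again before the function returns (the dead
  // store reported at line 110). Only the setDefault() call changes any
  // observable state, so the local assignment can simply be dropped.
  if (!Ctor)
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
}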

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUTargetMachine.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/build-llvm -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-command-line-argument -Wno-unknown-warning-option -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/build-llvm -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-26-234817-15343-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210926122410+d23fd8ae8906/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// The AMDGPU target machine contains all of the hardware specific
11/// information needed to emit code for SI+ GPUs.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUTargetMachine.h"
16#include "AMDGPU.h"
17#include "AMDGPUAliasAnalysis.h"
18#include "AMDGPUExportClustering.h"
19#include "AMDGPUMacroFusion.h"
20#include "AMDGPUTargetObjectFile.h"
21#include "AMDGPUTargetTransformInfo.h"
22#include "GCNIterativeScheduler.h"
23#include "GCNSchedStrategy.h"
24#include "R600.h"
25#include "R600TargetMachine.h"
26#include "SIMachineFunctionInfo.h"
27#include "SIMachineScheduler.h"
28#include "TargetInfo/AMDGPUTargetInfo.h"
29#include "llvm/Analysis/CGSCCPassManager.h"
30#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
31#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
32#include "llvm/CodeGen/GlobalISel/Legalizer.h"
33#include "llvm/CodeGen/GlobalISel/Localizer.h"
34#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
35#include "llvm/CodeGen/MIRParser/MIParser.h"
36#include "llvm/CodeGen/Passes.h"
37#include "llvm/CodeGen/RegAllocRegistry.h"
38#include "llvm/CodeGen/TargetPassConfig.h"
39#include "llvm/IR/LegacyPassManager.h"
40#include "llvm/IR/PassManager.h"
41#include "llvm/InitializePasses.h"
42#include "llvm/Passes/PassBuilder.h"
43#include "llvm/Support/TargetRegistry.h"
44#include "llvm/Transforms/IPO.h"
45#include "llvm/Transforms/IPO/AlwaysInliner.h"
46#include "llvm/Transforms/IPO/GlobalDCE.h"
47#include "llvm/Transforms/IPO/Internalize.h"
48#include "llvm/Transforms/IPO/PassManagerBuilder.h"
49#include "llvm/Transforms/Scalar.h"
50#include "llvm/Transforms/Scalar/GVN.h"
51#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
52#include "llvm/Transforms/Utils.h"
53#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
54#include "llvm/Transforms/Vectorize.h"
55
56using namespace llvm;
57
58namespace {
59class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
60public:
61 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
62 : RegisterRegAllocBase(N, D, C) {}
63};
64
65class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
66public:
67 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
68 : RegisterRegAllocBase(N, D, C) {}
69};
70
71static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
72 const TargetRegisterClass &RC) {
73 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
74}
75
76static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
77 const TargetRegisterClass &RC) {
78 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
79}
80
81
82/// -{sgpr|vgpr}-regalloc=... command line option.
83static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
84
85/// A dummy default pass factory indicates whether the register allocator is
86/// overridden on the command line.
87static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
88static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
89
90static SGPRRegisterRegAlloc
91defaultSGPRRegAlloc("default",
92 "pick SGPR register allocator based on -O option",
93 useDefaultRegisterAllocator);
94
95static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
96 RegisterPassParser<SGPRRegisterRegAlloc>>
97SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
98 cl::desc("Register allocator to use for SGPRs"));
99
100static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
101 RegisterPassParser<VGPRRegisterRegAlloc>>
102VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
103 cl::desc("Register allocator to use for VGPRs"));
104
105
106static void initializeDefaultSGPRRegisterAllocatorOnce() {
107 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
108
109 if (!Ctor) {
110 Ctor = SGPRRegAlloc;
Value stored to 'Ctor' is never read
111 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
112 }
113}
114
115static void initializeDefaultVGPRRegisterAllocatorOnce() {
116 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
117
118 if (!Ctor) {
119 Ctor = VGPRRegAlloc;
120 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
121 }
122}
123
124static FunctionPass *createBasicSGPRRegisterAllocator() {
125 return createBasicRegisterAllocator(onlyAllocateSGPRs);
126}
127
128static FunctionPass *createGreedySGPRRegisterAllocator() {
129 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
130}
131
132static FunctionPass *createFastSGPRRegisterAllocator() {
133 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
134}
135
136static FunctionPass *createBasicVGPRRegisterAllocator() {
137 return createBasicRegisterAllocator(onlyAllocateVGPRs);
138}
139
140static FunctionPass *createGreedyVGPRRegisterAllocator() {
141 return createGreedyRegisterAllocator(onlyAllocateVGPRs);
142}
143
144static FunctionPass *createFastVGPRRegisterAllocator() {
145 return createFastRegisterAllocator(onlyAllocateVGPRs, true);
146}
147
148static SGPRRegisterRegAlloc basicRegAllocSGPR(
149 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
150static SGPRRegisterRegAlloc greedyRegAllocSGPR(
151 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
152
153static SGPRRegisterRegAlloc fastRegAllocSGPR(
154 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
155
156
157static VGPRRegisterRegAlloc basicRegAllocVGPR(
158 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
159static VGPRRegisterRegAlloc greedyRegAllocVGPR(
160 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
161
162static VGPRRegisterRegAlloc fastRegAllocVGPR(
163 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
164}
165
166static cl::opt<bool> EnableSROA(
167 "amdgpu-sroa",
168 cl::desc("Run SROA after promote alloca pass"),
169 cl::ReallyHidden,
170 cl::init(true));
171
172static cl::opt<bool>
173EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
174 cl::desc("Run early if-conversion"),
175 cl::init(false));
176
177static cl::opt<bool>
178OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
179 cl::desc("Run pre-RA exec mask optimizations"),
180 cl::init(true));
181
182// Option to disable vectorizer for tests.
183static cl::opt<bool> EnableLoadStoreVectorizer(
184 "amdgpu-load-store-vectorizer",
185 cl::desc("Enable load store vectorizer"),
186 cl::init(true),
187 cl::Hidden);
188
189// Option to control global loads scalarization
190static cl::opt<bool> ScalarizeGlobal(
191 "amdgpu-scalarize-global-loads",
192 cl::desc("Enable global load scalarization"),
193 cl::init(true),
194 cl::Hidden);
195
196// Option to run internalize pass.
197static cl::opt<bool> InternalizeSymbols(
198 "amdgpu-internalize-symbols",
199 cl::desc("Enable elimination of non-kernel functions and unused globals"),
200 cl::init(false),
201 cl::Hidden);
202
203// Option to inline all early.
204static cl::opt<bool> EarlyInlineAll(
205 "amdgpu-early-inline-all",
206 cl::desc("Inline all functions early"),
207 cl::init(false),
208 cl::Hidden);
209
210static cl::opt<bool> EnableSDWAPeephole(
211 "amdgpu-sdwa-peephole",
212 cl::desc("Enable SDWA peepholer"),
213 cl::init(true));
214
215static cl::opt<bool> EnableDPPCombine(
216 "amdgpu-dpp-combine",
217 cl::desc("Enable DPP combiner"),
218 cl::init(true));
219
220// Enable address space based alias analysis
221static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
222 cl::desc("Enable AMDGPU Alias Analysis"),
223 cl::init(true));
224
225// Option to run late CFG structurizer
226static cl::opt<bool, true> LateCFGStructurize(
227 "amdgpu-late-structurize",
228 cl::desc("Enable late CFG structurization"),
229 cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
230 cl::Hidden);
231
232static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
233 "amdgpu-fixed-function-abi",
234 cl::desc("Enable all implicit function arguments"),
235 cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
236 cl::init(false),
237 cl::Hidden);
238
239// Enable lib calls simplifications
240static cl::opt<bool> EnableLibCallSimplify(
241 "amdgpu-simplify-libcall",
242 cl::desc("Enable amdgpu library simplifications"),
243 cl::init(true),
244 cl::Hidden);
245
246static cl::opt<bool> EnableLowerKernelArguments(
247 "amdgpu-ir-lower-kernel-arguments",
248 cl::desc("Lower kernel argument loads in IR pass"),
249 cl::init(true),
250 cl::Hidden);
251
252static cl::opt<bool> EnableRegReassign(
253 "amdgpu-reassign-regs",
254 cl::desc("Enable register reassign optimizations on gfx10+"),
255 cl::init(true),
256 cl::Hidden);
257
258static cl::opt<bool> OptVGPRLiveRange(
259 "amdgpu-opt-vgpr-liverange",
260 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
261 cl::init(true), cl::Hidden);
262
263// Enable atomic optimization
264static cl::opt<bool> EnableAtomicOptimizations(
265 "amdgpu-atomic-optimizations",
266 cl::desc("Enable atomic optimizations"),
267 cl::init(false),
268 cl::Hidden);
269
270// Enable Mode register optimization
271static cl::opt<bool> EnableSIModeRegisterPass(
272 "amdgpu-mode-register",
273 cl::desc("Enable mode register pass"),
274 cl::init(true),
275 cl::Hidden);
276
277 // This option is used in lit tests to prevent dead-code elimination of inspected patterns.
278static cl::opt<bool>
279EnableDCEInRA("amdgpu-dce-in-ra",
280 cl::init(true), cl::Hidden,
281 cl::desc("Enable machine DCE inside regalloc"));
282
283static cl::opt<bool> EnableScalarIRPasses(
284 "amdgpu-scalar-ir-passes",
285 cl::desc("Enable scalar IR passes"),
286 cl::init(true),
287 cl::Hidden);
288
289static cl::opt<bool> EnableStructurizerWorkarounds(
290 "amdgpu-enable-structurizer-workarounds",
291 cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
292 cl::Hidden);
293
294static cl::opt<bool> EnableLDSReplaceWithPointer(
295 "amdgpu-enable-lds-replace-with-pointer",
296 cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
297 cl::Hidden);
298
299static cl::opt<bool, true> EnableLowerModuleLDS(
300 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
301 cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
302 cl::Hidden);
303
304static cl::opt<bool> EnablePreRAOptimizations(
305 "amdgpu-enable-pre-ra-optimizations",
306 cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
307 cl::Hidden);
308
309extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
310 // Register the target
311 RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
312 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
313
314 PassRegistry *PR = PassRegistry::getPassRegistry();
315 initializeR600ClauseMergePassPass(*PR);
316 initializeR600ControlFlowFinalizerPass(*PR);
317 initializeR600PacketizerPass(*PR);
318 initializeR600ExpandSpecialInstrsPassPass(*PR);
319 initializeR600VectorRegMergerPass(*PR);
320 initializeGlobalISel(*PR);
321 initializeAMDGPUDAGToDAGISelPass(*PR);
322 initializeGCNDPPCombinePass(*PR);
323 initializeSILowerI1CopiesPass(*PR);
324 initializeSILowerSGPRSpillsPass(*PR);
325 initializeSIFixSGPRCopiesPass(*PR);
326 initializeSIFixVGPRCopiesPass(*PR);
327 initializeSIFoldOperandsPass(*PR);
328 initializeSIPeepholeSDWAPass(*PR);
329 initializeSIShrinkInstructionsPass(*PR);
330 initializeSIOptimizeExecMaskingPreRAPass(*PR);
331 initializeSIOptimizeVGPRLiveRangePass(*PR);
332 initializeSILoadStoreOptimizerPass(*PR);
333 initializeAMDGPUFixFunctionBitcastsPass(*PR);
334 initializeAMDGPUCtorDtorLoweringPass(*PR);
335 initializeAMDGPUAlwaysInlinePass(*PR);
336 initializeAMDGPUAttributorPass(*PR);
337 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
338 initializeAMDGPUAnnotateUniformValuesPass(*PR);
339 initializeAMDGPUArgumentUsageInfoPass(*PR);
340 initializeAMDGPUAtomicOptimizerPass(*PR);
341 initializeAMDGPULowerKernelArgumentsPass(*PR);
342 initializeAMDGPULowerKernelAttributesPass(*PR);
343 initializeAMDGPULowerIntrinsicsPass(*PR);
344 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
345 initializeAMDGPUPostLegalizerCombinerPass(*PR);
346 initializeAMDGPUPreLegalizerCombinerPass(*PR);
347 initializeAMDGPURegBankCombinerPass(*PR);
348 initializeAMDGPUPromoteAllocaPass(*PR);
349 initializeAMDGPUPromoteAllocaToVectorPass(*PR);
350 initializeAMDGPUCodeGenPreparePass(*PR);
351 initializeAMDGPULateCodeGenPreparePass(*PR);
352 initializeAMDGPUPropagateAttributesEarlyPass(*PR);
353 initializeAMDGPUPropagateAttributesLatePass(*PR);
354 initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
355 initializeAMDGPULowerModuleLDSPass(*PR);
356 initializeAMDGPURewriteOutArgumentsPass(*PR);
357 initializeAMDGPUUnifyMetadataPass(*PR);
358 initializeSIAnnotateControlFlowPass(*PR);
359 initializeSIInsertHardClausesPass(*PR);
360 initializeSIInsertWaitcntsPass(*PR);
361 initializeSIModeRegisterPass(*PR);
362 initializeSIWholeQuadModePass(*PR);
363 initializeSILowerControlFlowPass(*PR);
364 initializeSIPreEmitPeepholePass(*PR);
365 initializeSILateBranchLoweringPass(*PR);
366 initializeSIMemoryLegalizerPass(*PR);
367 initializeSIOptimizeExecMaskingPass(*PR);
368 initializeSIPreAllocateWWMRegsPass(*PR);
369 initializeSIFormMemoryClausesPass(*PR);
370 initializeSIPostRABundlerPass(*PR);
371 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
372 initializeAMDGPUAAWrapperPassPass(*PR);
373 initializeAMDGPUExternalAAWrapperPass(*PR);
374 initializeAMDGPUUseNativeCallsPass(*PR);
375 initializeAMDGPUSimplifyLibCallsPass(*PR);
376 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
377 initializeAMDGPUResourceUsageAnalysisPass(*PR);
378 initializeGCNNSAReassignPass(*PR);
379 initializeGCNPreRAOptimizationsPass(*PR);
380}
381
382static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
383 return std::make_unique<AMDGPUTargetObjectFile>();
384}
385
386static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
387 return new SIScheduleDAGMI(C);
388}
389
390static ScheduleDAGInstrs *
391createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
392 ScheduleDAGMILive *DAG =
393 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
394 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
395 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
396 DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
397 return DAG;
398}
399
400static ScheduleDAGInstrs *
401createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
402 auto DAG = new GCNIterativeScheduler(C,
403 GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
404 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
405 return DAG;
406}
407
408static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
409 return new GCNIterativeScheduler(C,
410 GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
411}
412
413static ScheduleDAGInstrs *
414createIterativeILPMachineScheduler(MachineSchedContext *C) {
415 auto DAG = new GCNIterativeScheduler(C,
416 GCNIterativeScheduler::SCHEDULE_ILP);
417 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
418 DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
419 return DAG;
420}
421
422static MachineSchedRegistry
423SISchedRegistry("si", "Run SI's custom scheduler",
424 createSIMachineScheduler);
425
426static MachineSchedRegistry
427GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
428 "Run GCN scheduler to maximize occupancy",
429 createGCNMaxOccupancyMachineScheduler);
430
431static MachineSchedRegistry
432IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
433 "Run GCN scheduler to maximize occupancy (experimental)",
434 createIterativeGCNMaxOccupancyMachineScheduler);
435
436static MachineSchedRegistry
437GCNMinRegSchedRegistry("gcn-minreg",
438 "Run GCN iterative scheduler for minimal register usage (experimental)",
439 createMinRegScheduler);
440
441static MachineSchedRegistry
442GCNILPSchedRegistry("gcn-ilp",
443 "Run GCN iterative scheduler for ILP scheduling (experimental)",
444 createIterativeILPMachineScheduler);
445
446static StringRef computeDataLayout(const Triple &TT) {
447 if (TT.getArch() == Triple::r600) {
448 // 32-bit pointers.
449 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
450 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
451 }
452
453 // 32-bit private, local, and region pointers. 64-bit global, constant and
454 // flat, non-integral buffer fat pointers.
455 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
456 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
457 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
458 "-ni:7";
459}
460
461LLVM_READNONE
462static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
463 if (!GPU.empty())
464 return GPU;
465
466 // Need to default to a target with flat support for HSA.
467 if (TT.getArch() == Triple::amdgcn)
468 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
469
470 return "r600";
471}
472
473static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
474 // The AMDGPU toolchain only supports generating shared objects, so we
475 // must always use PIC.
476 return Reloc::PIC_;
477}
478
479AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
480 StringRef CPU, StringRef FS,
481 TargetOptions Options,
482 Optional<Reloc::Model> RM,
483 Optional<CodeModel::Model> CM,
484 CodeGenOpt::Level OptLevel)
485 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
486 FS, Options, getEffectiveRelocModel(RM),
487 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
488 TLOF(createTLOF(getTargetTriple())) {
489 initAsmInfo();
490 if (TT.getArch() == Triple::amdgcn) {
491 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
492 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
493 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
494 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
495 }
496}
497
498bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
499bool AMDGPUTargetMachine::EnableFunctionCalls = false;
500bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
501bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
502
503AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
504
505StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
506 Attribute GPUAttr = F.getFnAttribute("target-cpu");
507 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
508}
509
510StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
511 Attribute FSAttr = F.getFnAttribute("target-features");
512
513 return FSAttr.isValid() ? FSAttr.getValueAsString()
514 : getTargetFeatureString();
515}
516
517/// Predicate for Internalize pass.
518static bool mustPreserveGV(const GlobalValue &GV) {
519 if (const Function *F = dyn_cast<Function>(&GV))
520 return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
521
522 GV.removeDeadConstantUsers();
523 return !GV.use_empty();
524}
525
526void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
527 Builder.DivergentTarget = true;
528
529 bool EnableOpt = getOptLevel() > CodeGenOpt::None;
530 bool Internalize = InternalizeSymbols;
531 bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
532 bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
533 bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
534
535 if (EnableFunctionCalls) {
536 delete Builder.Inliner;
537 Builder.Inliner = createFunctionInliningPass();
538 }
539
540 Builder.addExtension(
541 PassManagerBuilder::EP_ModuleOptimizerEarly,
542 [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
543 legacy::PassManagerBase &PM) {
544 if (AMDGPUAA) {
545 PM.add(createAMDGPUAAWrapperPass());
546 PM.add(createAMDGPUExternalAAWrapperPass());
547 }
548 PM.add(createAMDGPUUnifyMetadataPass());
549 PM.add(createAMDGPUPrintfRuntimeBinding());
550 if (Internalize)
551 PM.add(createInternalizePass(mustPreserveGV));
552 PM.add(createAMDGPUPropagateAttributesLatePass(this));
553 if (Internalize)
554 PM.add(createGlobalDCEPass());
555 if (EarlyInline)
556 PM.add(createAMDGPUAlwaysInlinePass(false));
557 });
558
559 Builder.addExtension(
560 PassManagerBuilder::EP_EarlyAsPossible,
561 [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
562 legacy::PassManagerBase &PM) {
563 if (AMDGPUAA) {
564 PM.add(createAMDGPUAAWrapperPass());
565 PM.add(createAMDGPUExternalAAWrapperPass());
566 }
567 PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
568 PM.add(llvm::createAMDGPUUseNativeCallsPass());
569 if (LibCallSimplify)
570 PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
571 });
572
573 Builder.addExtension(
574 PassManagerBuilder::EP_CGSCCOptimizerLate,
575 [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
576 // Add infer address spaces pass to the opt pipeline after inlining
577 // but before SROA to increase SROA opportunities.
578 PM.add(createInferAddressSpacesPass());
579
580 // This should run after inlining to have any chance of doing anything,
581 // and before other cleanup optimizations.
582 PM.add(createAMDGPULowerKernelAttributesPass());
583
584 // Promote alloca to vector before SROA and loop unroll. If we manage
585 // to eliminate allocas before unroll we may choose to unroll less.
586 if (EnableOpt)
587 PM.add(createAMDGPUPromoteAllocaToVector());
588 });
589}
590
591void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
592 AAM.registerFunctionAnalysis<AMDGPUAA>();
593}
594
595void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
596 PB.registerPipelineParsingCallback(
597 [this](StringRef PassName, ModulePassManager &PM,
598 ArrayRef<PassBuilder::PipelineElement>) {
599 if (PassName == "amdgpu-propagate-attributes-late") {
600 PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
601 return true;
602 }
603 if (PassName == "amdgpu-unify-metadata") {
604 PM.addPass(AMDGPUUnifyMetadataPass());
605 return true;
606 }
607 if (PassName == "amdgpu-printf-runtime-binding") {
608 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
609 return true;
610 }
611 if (PassName == "amdgpu-always-inline") {
612 PM.addPass(AMDGPUAlwaysInlinePass());
613 return true;
614 }
615 if (PassName == "amdgpu-replace-lds-use-with-pointer") {
616 PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
617 return true;
618 }
619 if (PassName == "amdgpu-lower-module-lds") {
620 PM.addPass(AMDGPULowerModuleLDSPass());
621 return true;
622 }
623 return false;
624 });
625 PB.registerPipelineParsingCallback(
626 [this](StringRef PassName, FunctionPassManager &PM,
627 ArrayRef<PassBuilder::PipelineElement>) {
628 if (PassName == "amdgpu-simplifylib") {
629 PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
630 return true;
631 }
632 if (PassName == "amdgpu-usenative") {
633 PM.addPass(AMDGPUUseNativeCallsPass());
634 return true;
635 }
636 if (PassName == "amdgpu-promote-alloca") {
637 PM.addPass(AMDGPUPromoteAllocaPass(*this));
638 return true;
639 }
640 if (PassName == "amdgpu-promote-alloca-to-vector") {
641 PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
642 return true;
643 }
644 if (PassName == "amdgpu-lower-kernel-attributes") {
645 PM.addPass(AMDGPULowerKernelAttributesPass());
646 return true;
647 }
648 if (PassName == "amdgpu-propagate-attributes-early") {
649 PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
650 return true;
651 }
652 return false;
653 });
654
655 PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
656 FAM.registerPass([&] { return AMDGPUAA(); });
657 });
658
659 PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
660 if (AAName == "amdgpu-aa") {
661 AAM.registerFunctionAnalysis<AMDGPUAA>();
662 return true;
663 }
664 return false;
665 });
666
667 PB.registerPipelineStartEPCallback(
668 [this](ModulePassManager &PM, OptimizationLevel Level) {
669 FunctionPassManager FPM;
670 FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
671 FPM.addPass(AMDGPUUseNativeCallsPass());
672 if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
673 FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
674 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
675 });
676
677 PB.registerPipelineEarlySimplificationEPCallback(
678 [this](ModulePassManager &PM, OptimizationLevel Level) {
679 if (Level == OptimizationLevel::O0)
680 return;
681
682 PM.addPass(AMDGPUUnifyMetadataPass());
683 PM.addPass(AMDGPUPrintfRuntimeBindingPass());
684
685 if (InternalizeSymbols) {
686 PM.addPass(InternalizePass(mustPreserveGV));
687 }
688 PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
689 if (InternalizeSymbols) {
690 PM.addPass(GlobalDCEPass());
691 }
692 if (EarlyInlineAll && !EnableFunctionCalls)
693 PM.addPass(AMDGPUAlwaysInlinePass());
694 });
695
696 PB.registerCGSCCOptimizerLateEPCallback(
697 [this](CGSCCPassManager &PM, OptimizationLevel Level) {
698 if (Level == OptimizationLevel::O0)
699 return;
700
701 FunctionPassManager FPM;
702
703 // Add infer address spaces pass to the opt pipeline after inlining
704 // but before SROA to increase SROA opportunities.
705 FPM.addPass(InferAddressSpacesPass());
706
707 // This should run after inlining to have any chance of doing
708 // anything, and before other cleanup optimizations.
709 FPM.addPass(AMDGPULowerKernelAttributesPass());
710
711 if (Level != OptimizationLevel::O0) {
712 // Promote alloca to vector before SROA and loop unroll. If we
713 // manage to eliminate allocas before unroll we may choose to unroll
714 // less.
715 FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
716 }
717
718 PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
719 });
720}
721
722int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
723 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
724 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
725 AddrSpace == AMDGPUAS::REGION_ADDRESS)
726 ? -1
727 : 0;
728}
729
730bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
731 unsigned DestAS) const {
732 return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
733 AMDGPU::isFlatGlobalAddrSpace(DestAS);
734}
735
736unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
737 const auto *LD = dyn_cast<LoadInst>(V);
738 if (!LD)
739 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
740
741 // The loaded value must be a generic pointer.
742 assert(V->getType()->isPointerTy() &&
743 V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
744
745 const auto *Ptr = LD->getPointerOperand();
746 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
747 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
748 // For a generic pointer loaded from the constant memory, it could be assumed
749 // as a global pointer since the constant memory is only populated on the
750 // host side. As implied by the offload programming model, only global
751 // pointers could be referenced on the host side.
752 return AMDGPUAS::GLOBAL_ADDRESS;
753}
754
755//===----------------------------------------------------------------------===//
756// GCN Target Machine (SI+)
757//===----------------------------------------------------------------------===//
758
759GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
760 StringRef CPU, StringRef FS,
761 TargetOptions Options,
762 Optional<Reloc::Model> RM,
763 Optional<CodeModel::Model> CM,
764 CodeGenOpt::Level OL, bool JIT)
765 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
766
767const TargetSubtargetInfo *
768GCNTargetMachine::getSubtargetImpl(const Function &F) const {
769 StringRef GPU = getGPUName(F);
770 StringRef FS = getFeatureString(F);
771
772 SmallString<128> SubtargetKey(GPU);
773 SubtargetKey.append(FS);
774
775 auto &I = SubtargetMap[SubtargetKey];
776 if (!I) {
777 // This needs to be done before we create a new subtarget since any
778 // creation will depend on the TM and the code generation flags on the
779 // function that reside in TargetOptions.
780 resetTargetOptions(F);
781 I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
782 }
783
784 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
785
786 return I.get();
787}
788
789TargetTransformInfo
790GCNTargetMachine::getTargetTransformInfo(const Function &F) {
791 return TargetTransformInfo(GCNTTIImpl(this, F));
792}
793
794//===----------------------------------------------------------------------===//
795// AMDGPU Pass Setup
796//===----------------------------------------------------------------------===//
797
798std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
799 return getStandardCSEConfigForOpt(TM->getOptLevel());
800}
801
802namespace {
803
804class GCNPassConfig final : public AMDGPUPassConfig {
805public:
806 GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
807 : AMDGPUPassConfig(TM, PM) {
808 // It is necessary to know the register usage of the entire call graph. We
809 // allow calls without EnableAMDGPUFunctionCalls if they are marked
810 // noinline, so this is always required.
811 setRequiresCodeGenSCCOrder(true);
812 substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
813 }
814
815 GCNTargetMachine &getGCNTargetMachine() const {
816 return getTM<GCNTargetMachine>();
817 }
818
819 ScheduleDAGInstrs *
820 createMachineScheduler(MachineSchedContext *C) const override;
821
822 ScheduleDAGInstrs *
823 createPostMachineScheduler(MachineSchedContext *C) const override {
824 ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
825 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
826 DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
827 return DAG;
828 }
829
830 bool addPreISel() override;
831 void addMachineSSAOptimization() override;
832 bool addILPOpts() override;
833 bool addInstSelector() override;
834 bool addIRTranslator() override;
835 void addPreLegalizeMachineIR() override;
836 bool addLegalizeMachineIR() override;
837 void addPreRegBankSelect() override;
838 bool addRegBankSelect() override;
839 void addPreGlobalInstructionSelect() override;
840 bool addGlobalInstructionSelect() override;
841 void addFastRegAlloc() override;
842 void addOptimizedRegAlloc() override;
843
844 FunctionPass *createSGPRAllocPass(bool Optimized);
845 FunctionPass *createVGPRAllocPass(bool Optimized);
846 FunctionPass *createRegAllocPass(bool Optimized) override;
847
848 bool addRegAssignAndRewriteFast() override;
849 bool addRegAssignAndRewriteOptimized() override;
850
851 void addPreRegAlloc() override;
852 bool addPreRewrite() override;
853 void addPostRegAlloc() override;
854 void addPreSched2() override;
855 void addPreEmitPass() override;
856};
857
858} // end anonymous namespace
859
860AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
861 : TargetPassConfig(TM, PM) {
862 // Exceptions and StackMaps are not supported, so these passes will never do
863 // anything.
864 disablePass(&StackMapLivenessID);
865 disablePass(&FuncletLayoutID);
866 // Garbage collection is not supported.
867 disablePass(&GCLoweringID);
868 disablePass(&ShadowStackGCLoweringID);
869}
870
871void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
872 if (getOptLevel() == CodeGenOpt::Aggressive)
873 addPass(createGVNPass());
874 else
875 addPass(createEarlyCSEPass());
876}
877
878void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
879 addPass(createLICMPass());
880 addPass(createSeparateConstOffsetFromGEPPass());
881 addPass(createSpeculativeExecutionPass());
882 // ReassociateGEPs exposes more opportunities for SLSR. See
883 // the example in reassociate-geps-and-slsr.ll.
884 addPass(createStraightLineStrengthReducePass());
885 // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
886 // EarlyCSE can reuse.
887 addEarlyCSEOrGVNPass();
888 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
889 addPass(createNaryReassociatePass());
890 // NaryReassociate on GEPs creates redundant common expressions, so run
891 // EarlyCSE after it.
892 addPass(createEarlyCSEPass());
893}
894
895void AMDGPUPassConfig::addIRPasses() {
896 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
897
898 // There is no reason to run these.
899 disablePass(&StackMapLivenessID);
900 disablePass(&FuncletLayoutID);
901 disablePass(&PatchableFunctionID);
902
903 addPass(createAMDGPUPrintfRuntimeBinding());
904 addPass(createAMDGPUCtorDtorLoweringPass());
905
906 // This must occur before inlining, as the inliner will not look through
907 // bitcast calls.
908 addPass(createAMDGPUFixFunctionBitcastsPass());
909
910 // Run the propagate-attributes pass in the backend in case opt was not run.
911 addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
912
913 addPass(createAMDGPULowerIntrinsicsPass());
914
915 // Function calls are not supported, so make sure we inline everything.
916 addPass(createAMDGPUAlwaysInlinePass());
917 addPass(createAlwaysInlinerLegacyPass());
918 // We need to add the barrier noop pass, otherwise adding the function
919 // inlining pass will cause all of the PassConfigs passes to be run
920 // one function at a time, which means if we have a module with two
921 // functions, then we will generate code for the first function
922 // without ever running any passes on the second.
923 addPass(createBarrierNoopPass());
924
925 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
926 if (TM.getTargetTriple().getArch() == Triple::r600)
927 addPass(createR600OpenCLImageTypeLoweringPass());
928
929 // Replace OpenCL enqueued block function pointers with global variables.
930 addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
931
932 // Can increase the LDS used by a kernel, so this runs before PromoteAlloca.
933 if (EnableLowerModuleLDS) {
934 // The pass "amdgpu-replace-lds-use-with-pointer" needs to be run before the
935 // pass "amdgpu-lower-module-lds", and it should be run only if the
936 // "amdgpu-lower-module-lds" pass is enabled.
937 if (EnableLDSReplaceWithPointer)
938 addPass(createAMDGPUReplaceLDSUseWithPointerPass());
939
940 addPass(createAMDGPULowerModuleLDSPass());
941 }
942
943 if (TM.getOptLevel() > CodeGenOpt::None)
944 addPass(createInferAddressSpacesPass());
945
946 addPass(createAtomicExpandPass());
947
948 if (TM.getOptLevel() > CodeGenOpt::None) {
949 addPass(createAMDGPUPromoteAlloca());
950
951 if (EnableSROA)
952 addPass(createSROAPass());
953 if (isPassEnabled(EnableScalarIRPasses))
954 addStraightLineScalarOptimizationPasses();
955
956 if (EnableAMDGPUAliasAnalysis) {
957 addPass(createAMDGPUAAWrapperPass());
958 addPass(createExternalAAWrapperPass([](Pass &P, Function &,
959 AAResults &AAR) {
960 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
961 AAR.addAAResult(WrapperPass->getResult());
962 }));
963 }
964
965 if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
966 // TODO: May want to move later or split into an early and late one.
967 addPass(createAMDGPUCodeGenPreparePass());
968 }
969 }
970
971 TargetPassConfig::addIRPasses();
972
973 // EarlyCSE is not always strong enough to clean up what LSR produces. For
974 // example, GVN can combine
975 //
976 // %0 = add %a, %b
977 // %1 = add %b, %a
978 //
979 // and
980 //
981 // %0 = shl nsw %a, 2
982 // %1 = shl %a, 2
983 //
984 // but EarlyCSE can do neither of them.
985 if (isPassEnabled(EnableScalarIRPasses))
986 addEarlyCSEOrGVNPass();
987}
988
989void AMDGPUPassConfig::addCodeGenPrepare() {
990 if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
991 addPass(createAMDGPUAttributorPass());
992
993 // FIXME: This pass adds 2 hacky attributes that can be replaced with an
994 // analysis, and should be removed.
995 addPass(createAMDGPUAnnotateKernelFeaturesPass());
996 }
997
998 if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
999 EnableLowerKernelArguments)
1000 addPass(createAMDGPULowerKernelArgumentsPass());
1001
1002 TargetPassConfig::addCodeGenPrepare();
1003
1004 if (isPassEnabled(EnableLoadStoreVectorizer))
1005 addPass(createLoadStoreVectorizerPass());
1006
1007 // LowerSwitch pass may introduce unreachable blocks that can
1008 // cause unexpected behavior for subsequent passes. Placing it
1009 // here seems better, so that these blocks get cleaned up by
1010 // UnreachableBlockElim, which is inserted next in the pass flow.
1011 addPass(createLowerSwitchPass());
1012}
1013
1014bool AMDGPUPassConfig::addPreISel() {
1015 if (TM->getOptLevel() > CodeGenOpt::None)
1016 addPass(createFlattenCFGPass());
1017 return false;
1018}
1019
1020bool AMDGPUPassConfig::addInstSelector() {
1021 // Defer the verifier until FinalizeISel.
1022 addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
1023 return false;
1024}
1025
1026bool AMDGPUPassConfig::addGCPasses() {
1027 // Do nothing. GC is not supported.
1028 return false;
1029}
1030
1031llvm::ScheduleDAGInstrs *
1032AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1033 ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1034 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1035 return DAG;
1036}
1037
1038//===----------------------------------------------------------------------===//
1039// GCN Pass Setup
1040//===----------------------------------------------------------------------===//
1041
1042ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1043 MachineSchedContext *C) const {
1044 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1045 if (ST.enableSIScheduler())
1046 return createSIMachineScheduler(C);
1047 return createGCNMaxOccupancyMachineScheduler(C);
1048}
1049
1050bool GCNPassConfig::addPreISel() {
1051 AMDGPUPassConfig::addPreISel();
1052
1053 if (TM->getOptLevel() > CodeGenOpt::None)
1054 addPass(createAMDGPULateCodeGenPreparePass());
1055
1056 if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
1057 addPass(createAMDGPUAtomicOptimizerPass());
1058 }
1059
1060 if (TM->getOptLevel() > CodeGenOpt::None)
1061 addPass(createSinkingPass());
1062
1063 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1064 // regions formed by them.
1065 addPass(&AMDGPUUnifyDivergentExitNodesID);
1066 if (!LateCFGStructurize) {
1067 if (EnableStructurizerWorkarounds) {
1068 addPass(createFixIrreduciblePass());
1069 addPass(createUnifyLoopExitsPass());
1070 }
1071 addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1072 }
1073 addPass(createAMDGPUAnnotateUniformValues());
1074 if (!LateCFGStructurize) {
1075 addPass(createSIAnnotateControlFlowPass());
1076 }
1077 addPass(createLCSSAPass());
1078
1079 if (TM->getOptLevel() > CodeGenOpt::Less)
1080 addPass(&AMDGPUPerfHintAnalysisID);
1081
1082 return false;
1083}
1084
1085void GCNPassConfig::addMachineSSAOptimization() {
1086 TargetPassConfig::addMachineSSAOptimization();
1087
1088 // We want to fold operands after PeepholeOptimizer has run (or as part of
1089 // it), because it will eliminate extra copies making it easier to fold the
1090 // real source operand. We want to eliminate dead instructions after, so that
1091 // we see fewer uses of the copies. We then need to clean up the dead
1092 // instructions leftover after the operands are folded as well.
1093 //
1094 // XXX - Can we get away without running DeadMachineInstructionElim again?
1095 addPass(&SIFoldOperandsID);
1096 if (EnableDPPCombine)
1097 addPass(&GCNDPPCombineID);
1098 addPass(&SILoadStoreOptimizerID);
1099 if (isPassEnabled(EnableSDWAPeephole)) {
1100 addPass(&SIPeepholeSDWAID);
1101 addPass(&EarlyMachineLICMID);
1102 addPass(&MachineCSEID);
1103 addPass(&SIFoldOperandsID);
1104 }
1105 addPass(&DeadMachineInstructionElimID);
1106 addPass(createSIShrinkInstructionsPass());
1107}
1108
1109bool GCNPassConfig::addILPOpts() {
1110 if (EnableEarlyIfConversion)
1111 addPass(&EarlyIfConverterID);
1112
1113 TargetPassConfig::addILPOpts();
1114 return false;
1115}
1116
1117bool GCNPassConfig::addInstSelector() {
1118 AMDGPUPassConfig::addInstSelector();
1119 addPass(&SIFixSGPRCopiesID);
1120 addPass(createSILowerI1CopiesPass());
1121 return false;
1122}
1123
1124bool GCNPassConfig::addIRTranslator() {
1125 addPass(new IRTranslator(getOptLevel()));
1126 return false;
1127}
1128
1129void GCNPassConfig::addPreLegalizeMachineIR() {
1130 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1131 addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1132 addPass(new Localizer());
1133}
1134
1135bool GCNPassConfig::addLegalizeMachineIR() {
1136 addPass(new Legalizer());
1137 return false;
1138}
1139
1140void GCNPassConfig::addPreRegBankSelect() {
1141 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1142 addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1143}
1144
1145bool GCNPassConfig::addRegBankSelect() {
1146 addPass(new RegBankSelect());
1147 return false;
1148}
1149
1150void GCNPassConfig::addPreGlobalInstructionSelect() {
1151 bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1152 addPass(createAMDGPURegBankCombiner(IsOptNone));
1153}
1154
1155bool GCNPassConfig::addGlobalInstructionSelect() {
1156 addPass(new InstructionSelect(getOptLevel()));
1157 return false;
1158}
1159
1160void GCNPassConfig::addPreRegAlloc() {
1161 if (LateCFGStructurize) {
1162 addPass(createAMDGPUMachineCFGStructurizerPass());
1163 }
1164}
1165
1166void GCNPassConfig::addFastRegAlloc() {
1167 // FIXME: We have to disable the verifier here because of PHIElimination +
1168 // TwoAddressInstructions disabling it.
1169
1170 // This must be run immediately after phi elimination and before
1171 // TwoAddressInstructions, otherwise the processing of the tied operand of
1172 // SI_ELSE will introduce a copy of the tied operand source after the else.
1173 insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
1174
1175 insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1176 insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
1177
1178 TargetPassConfig::addFastRegAlloc();
1179}
1180
1181void GCNPassConfig::addOptimizedRegAlloc() {
1182 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1183 // instructions that cause scheduling barriers.
1184 insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1185 insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
1186
1187 if (OptExecMaskPreRA)
1188 insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1189
1190 if (isPassEnabled(EnablePreRAOptimizations))
1191 insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1192
1193 // This is not an essential optimization and it has a noticeable impact on
1194 // compilation time, so we only enable it from O2.
1195 if (TM->getOptLevel() > CodeGenOpt::Less)
1196 insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1197
1198 // FIXME: When an instruction has a killed operand and the instruction is
1199 // inside a bundle, it seems that only the BUNDLE instruction appears as the
1200 // kill of the register in LiveVariables. This would trigger a verifier
1201 // failure, so we should fix it and then enable the verifier.
1202 if (OptVGPRLiveRange)
1203 insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID, false);
1204 // This must be run immediately after phi elimination and before
1205 // TwoAddressInstructions, otherwise the processing of the tied operand of
1206 // SI_ELSE will introduce a copy of the tied operand source after the else.
1207 insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
1208
1209 if (EnableDCEInRA)
1210 insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1211
1212 TargetPassConfig::addOptimizedRegAlloc();
1213}
1214
1215bool GCNPassConfig::addPreRewrite() {
1216 if (EnableRegReassign)
1217 addPass(&GCNNSAReassignID);
1218 return true;
1219}
1220
1221FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1222 // Initialize the global default.
1223 llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1224 initializeDefaultSGPRRegisterAllocatorOnce);
1225
1226 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1227 if (Ctor != useDefaultRegisterAllocator)
1228 return Ctor();
1229
1230 if (Optimized)
1231 return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1232
1233 return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1234}
1235
1236FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1237 // Initialize the global default.
1238 llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1239 initializeDefaultVGPRRegisterAllocatorOnce);
1240
1241 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1242 if (Ctor != useDefaultRegisterAllocator)
1243 return Ctor();
1244
1245 if (Optimized)
1246 return createGreedyVGPRRegisterAllocator();
1247
1248 return createFastVGPRRegisterAllocator();
1249}
1250
1251FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1252 llvm_unreachable("should not be used");
1253}
1254
1255static const char RegAllocOptNotSupportedMessage[] =
1256 "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1257
1258bool GCNPassConfig::addRegAssignAndRewriteFast() {
1259 if (!usingDefaultRegAlloc())
1260 report_fatal_error(RegAllocOptNotSupportedMessage);
1261
1262 addPass(createSGPRAllocPass(false));
1263
1264 // Equivalent of PEI for SGPRs.
1265 addPass(&SILowerSGPRSpillsID);
1266
1267 addPass(createVGPRAllocPass(false));
1268 return true;
1269}
1270
1271bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1272 if (!usingDefaultRegAlloc())
1273 report_fatal_error(RegAllocOptNotSupportedMessage);
1274
1275 addPass(createSGPRAllocPass(true));
1276
1277 // Commit allocated register changes. This is mostly necessary because too
1278 // many things rely on the use lists of the physical registers, such as the
1279 // verifier. This is only necessary with allocators which use LiveIntervals,
1280 // since FastRegAlloc does the replacements itself.
1281 addPass(createVirtRegRewriter(false));
1282
1283 // Equivalent of PEI for SGPRs.
1284 addPass(&SILowerSGPRSpillsID);
1285
1286 addPass(createVGPRAllocPass(true));
1287
1288 addPreRewrite();
1289 addPass(&VirtRegRewriterID);
1290
1291 return true;
1292}
1293
1294void GCNPassConfig::addPostRegAlloc() {
1295 addPass(&SIFixVGPRCopiesID);
1296 if (getOptLevel() > CodeGenOpt::None)
1297 addPass(&SIOptimizeExecMaskingID);
1298 TargetPassConfig::addPostRegAlloc();
1299}
1300
1301void GCNPassConfig::addPreSched2() {
1302 addPass(&SIPostRABundlerID);
1303}
1304
1305void GCNPassConfig::addPreEmitPass() {
1306 addPass(createSIMemoryLegalizerPass());
1307 addPass(createSIInsertWaitcntsPass());
1308
1309 if (TM->getOptLevel() > CodeGenOpt::None)
1310 addPass(createSIShrinkInstructionsPass());
1311
1312 addPass(createSIModeRegisterPass());
1313
1314 if (getOptLevel() > CodeGenOpt::None)
1315 addPass(&SIInsertHardClausesID);
1316
1317 addPass(&SILateBranchLoweringPassID);
1318 if (getOptLevel() > CodeGenOpt::None)
1319 addPass(&SIPreEmitPeepholeID);
1320 // The hazard recognizer that runs as part of the post-ra scheduler does not
1321 // guarantee to be able to handle all hazards correctly. This is because if there
1322 // are multiple scheduling regions in a basic block, the regions are scheduled
1323 // bottom up, so when we begin to schedule a region we don't know what
1324 // instructions were emitted directly before it.
1325 //
1326 // Here we add a stand-alone hazard recognizer pass which can handle all
1327 // cases.
1328 addPass(&PostRAHazardRecognizerID);
1329 addPass(&BranchRelaxationPassID);
1330}
1331
1332TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1333 return new GCNPassConfig(*this, PM);
1334}
1335
1336yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1337 return new yaml::SIMachineFunctionInfo();
1338}
1339
1340yaml::MachineFunctionInfo *
1341GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1342 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1343 return new yaml::SIMachineFunctionInfo(
1344 *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
1345}
1346
1347bool GCNTargetMachine::parseMachineFunctionInfo(
1348 const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1349 SMDiagnostic &Error, SMRange &SourceRange) const {
1350 const yaml::SIMachineFunctionInfo &YamlMFI =
1351 reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1352 MachineFunction &MF = PFS.MF;
1353 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1354
1355 if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
1356 return true;
1357
1358 if (MFI->Occupancy == 0) {
1359 // Fixup the subtarget dependent default value.
1360 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1361 MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
1362 }
1363
1364 auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1365 Register TempReg;
1366 if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1367 SourceRange = RegName.SourceRange;
1368 return true;
1369 }
1370 RegVal = TempReg;
1371
1372 return false;
1373 };
1374
1375 auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
1376 // Create a diagnostic for the register string literal.
1377 const MemoryBuffer &Buffer =
1378 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1379 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1380 RegName.Value.size(), SourceMgr::DK_Error,
1381 "incorrect register class for field", RegName.Value,
1382 None, None);
1383 SourceRange = RegName.SourceRange;
1384 return true;
1385 };
1386
1387 if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1388 parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1389 parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1390 return true;
1391
1392 if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1393 !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1394 return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1395 }
1396
1397 if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1398 !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1399 return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1400 }
1401
1402 if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1403 !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1404 return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1405 }
1406
1407 auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
1408 const TargetRegisterClass &RC,
1409 ArgDescriptor &Arg, unsigned UserSGPRs,
1410 unsigned SystemSGPRs) {
1411 // Skip parsing if it's not present.
1412 if (!A)
1413 return false;
1414
1415 if (A->IsRegister) {
1416 Register Reg;
1417 if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1418 SourceRange = A->RegisterName.SourceRange;
1419 return true;
1420 }
1421 if (!RC.contains(Reg))
1422 return diagnoseRegisterClass(A->RegisterName);
1423 Arg = ArgDescriptor::createRegister(Reg);
1424 } else
1425 Arg = ArgDescriptor::createStack(A->StackOffset);
1426 // Check and apply the optional mask.
1427 if (A->Mask)
1428 Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());
1429
1430 MFI->NumUserSGPRs += UserSGPRs;
1431 MFI->NumSystemSGPRs += SystemSGPRs;
1432 return false;
1433 };
1434
1435 if (YamlMFI.ArgInfo &&
1436 (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1437 AMDGPU::SGPR_128RegClass,
1438 MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1439 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1440 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1441 2, 0) ||
1442 parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1443 MFI->ArgInfo.QueuePtr, 2, 0) ||
1444 parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1445 AMDGPU::SReg_64RegClass,
1446 MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1447 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1448 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1449 2, 0) ||
1450 parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1451 AMDGPU::SReg_64RegClass,
1452 MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1453 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1454 AMDGPU::SGPR_32RegClass,
1455 MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1456 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1457 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1458 0, 1) ||
1459 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1460 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1461 0, 1) ||
1462 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1463 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1464 0, 1) ||
1465 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1466 AMDGPU::SGPR_32RegClass,
1467 MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1468 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1469 AMDGPU::SGPR_32RegClass,
1470 MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1471 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1472 AMDGPU::SReg_64RegClass,
1473 MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1474 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1475 AMDGPU::SReg_64RegClass,
1476 MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1477 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1478 AMDGPU::VGPR_32RegClass,
1479 MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1480 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1481 AMDGPU::VGPR_32RegClass,
1482 MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1483 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1484 AMDGPU::VGPR_32RegClass,
1485 MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1486 return true;
1487
1488 MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1489 MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1490 MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
1491 MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
1492 MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
1493 MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;
1494
1495 return false;
1496}