//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware-specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable the vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control scalarization of global loads.
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run the internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all functions early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

// Enable address-space-based alias analysis.
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run the late CFG structurizer.
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
  "amdgpu-function-calls",
  cl::desc("Enable AMDGPU function call support"),
  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
  cl::init(false),
  cl::Hidden);

// Enable library call simplifications.
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

// Enable atomic optimization.
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);
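// Entry point invoked by LLVM's target-initialization machinery when the
// AMDGPU backend is linked in: it registers both target machines and the
// backend's passes with the legacy PassRegistry.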
extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the targets.
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIFixWWMLivenessPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUInlinerPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return llvm::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

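// Register the scheduler factories above under stable names so they can be
// selected on the command line, e.g. with llc's -misched=gcn-max-occupancy.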
246 : R600SchedRegistry("r600", "Run R600's custom scheduler",
247 : createR600MachineScheduler);
248 :
249 : static MachineSchedRegistry
250 : SISchedRegistry("si", "Run SI's custom scheduler",
251 : createSIMachineScheduler);
252 :
253 : static MachineSchedRegistry
254 : GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
255 : "Run GCN scheduler to maximize occupancy",
256 : createGCNMaxOccupancyMachineScheduler);
257 :
258 : static MachineSchedRegistry
259 : IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
260 : "Run GCN scheduler to maximize occupancy (experimental)",
261 : createIterativeGCNMaxOccupancyMachineScheduler);
262 :
263 : static MachineSchedRegistry
264 : GCNMinRegSchedRegistry("gcn-minreg",
265 : "Run GCN iterative scheduler for minimal register usage (experimental)",
266 : createMinRegScheduler);
267 :
268 : static MachineSchedRegistry
269 : GCNILPSchedRegistry("gcn-ilp",
270 : "Run GCN iterative scheduler for ILP scheduling (experimental)",
271 : createIterativeILPMachineScheduler);
272 :
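// A note on the datalayout strings below: "pN:SZ" gives the pointer size and
// ABI alignment (in bits) for address space N, "A5" places allocas in address
// space 5 (private), "n32:64" lists the native integer widths, and "S32" is
// the natural stack alignment in bits.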
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}

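// If no CPU was requested (e.g. no -mcpu on the llc command line), fall back
// to a default processor for the triple: "generic" for amdgcn, "r600"
// otherwise.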
LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  if (TT.getArch() == Triple::amdgcn)
    return "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
  if (CM)
    return *CM;
  return CodeModel::Small;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM),
                      getEffectiveCodeModel(CM), OptLevel),
    TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
  return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
    if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
      AAR.addAAResult(WrapperPass->getResult());
  });
}

/// Predicate for the Internalize pass: preserve declarations, entry-point
/// functions, and globals that still have uses.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}

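// Register AMDGPU-specific IR passes with the middle-end pipeline that
// PassManagerBuilder constructs; each addExtension call below hooks a
// callback into one of the builder's extension points.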
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableAMDGPUFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createAMDGPUFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
                                         legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  const auto &Opt = Options;
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
                                      legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());
  });
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);
}

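// Subtargets are created lazily, one per distinct (GPU, feature string)
// pair, and cached in SubtargetMap so functions with identical target
// attributes share a single subtarget object.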
const R600Subtarget *R600TargetMachine::getSubtargetImpl(
    const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never
    // do anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph.
    // We allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

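// At CodeGenOpt::Aggressive run full GVN for stronger redundancy
// elimination; at lower optimization levels the cheaper EarlyCSE suffices.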
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN
  // or EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAtomicExpandPass());
  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass; otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means that if we have a module with two
  // functions, we will generate code for the first function without ever
  // running any passes on the second.
  addPass(createBarrierNoopPass());

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.
    addPass(createAMDGPUCodeGenPreparePass());
  }

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createLowerSwitchPass());
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

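// Choose the machine scheduler for GCN: the SI machine scheduler when the
// subtarget asks for it (GCNSubtarget::enableSIScheduler()); otherwise the
// default max-occupancy scheduler defined above.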
ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
    MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableAtomicOptimizations) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(createAMDGPUAnnotateKernelFeaturesPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies, making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions left over after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee that it can handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  //
  // FIXME: This stand-alone pass will emit individual S_NOP 0 instructions,
  // as needed. It would be better for it to emit S_NOP <N> when possible.
  addPass(&PostRAHazardRecognizerID);

  addPass(&SIInsertSkipsPassID);
  addPass(createSIDebuggerInsertNopsPass());
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}