| File: | build/source/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |
| Warning: | line 120, column 5: Value stored to 'Ctor' is never read |
| 1 | //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// The AMDGPU target machine contains all of the hardware specific |
| 11 | /// information needed to emit code for SI+ GPUs. |
| 12 | // |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "AMDGPUTargetMachine.h" |
| 16 | #include "AMDGPU.h" |
| 17 | #include "AMDGPUAliasAnalysis.h" |
| 18 | #include "AMDGPUCtorDtorLowering.h" |
| 19 | #include "AMDGPUExportClustering.h" |
| 20 | #include "AMDGPUIGroupLP.h" |
| 21 | #include "AMDGPUMacroFusion.h" |
| 22 | #include "AMDGPURegBankSelect.h" |
| 23 | #include "AMDGPUTargetObjectFile.h" |
| 24 | #include "AMDGPUTargetTransformInfo.h" |
| 25 | #include "AMDGPUUnifyDivergentExitNodes.h" |
| 26 | #include "GCNIterativeScheduler.h" |
| 27 | #include "GCNSchedStrategy.h" |
| 28 | #include "GCNVOPDUtils.h" |
| 29 | #include "R600.h" |
| 30 | #include "R600MachineFunctionInfo.h" |
| 31 | #include "R600TargetMachine.h" |
| 32 | #include "SIMachineFunctionInfo.h" |
| 33 | #include "SIMachineScheduler.h" |
| 34 | #include "TargetInfo/AMDGPUTargetInfo.h" |
| 35 | #include "Utils/AMDGPUBaseInfo.h" |
| 36 | #include "llvm/Analysis/CGSCCPassManager.h" |
| 37 | #include "llvm/CodeGen/GlobalISel/CSEInfo.h" |
| 38 | #include "llvm/CodeGen/GlobalISel/IRTranslator.h" |
| 39 | #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" |
| 40 | #include "llvm/CodeGen/GlobalISel/Legalizer.h" |
| 41 | #include "llvm/CodeGen/GlobalISel/Localizer.h" |
| 42 | #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" |
| 43 | #include "llvm/CodeGen/MIRParser/MIParser.h" |
| 44 | #include "llvm/CodeGen/Passes.h" |
| 45 | #include "llvm/CodeGen/RegAllocRegistry.h" |
| 46 | #include "llvm/CodeGen/TargetPassConfig.h" |
| 47 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
| 48 | #include "llvm/IR/PassManager.h" |
| 49 | #include "llvm/IR/PatternMatch.h" |
| 50 | #include "llvm/InitializePasses.h" |
| 51 | #include "llvm/MC/TargetRegistry.h" |
| 52 | #include "llvm/Passes/PassBuilder.h" |
| 53 | #include "llvm/Transforms/IPO.h" |
| 54 | #include "llvm/Transforms/IPO/AlwaysInliner.h" |
| 55 | #include "llvm/Transforms/IPO/GlobalDCE.h" |
| 56 | #include "llvm/Transforms/IPO/Internalize.h" |
| 57 | #include "llvm/Transforms/Scalar.h" |
| 58 | #include "llvm/Transforms/Scalar/GVN.h" |
| 59 | #include "llvm/Transforms/Scalar/InferAddressSpaces.h" |
| 60 | #include "llvm/Transforms/Utils.h" |
| 61 | #include "llvm/Transforms/Utils/SimplifyLibCalls.h" |
| 62 | #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" |
| 63 | #include <optional> |
| 64 | |
| 65 | using namespace llvm; |
| 66 | using namespace llvm::PatternMatch; |
| 67 | |
| 68 | namespace { |
| 69 | class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { |
| 70 | public: |
| 71 | SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) |
| 72 | : RegisterRegAllocBase(N, D, C) {} |
| 73 | }; |
| 74 | |
| 75 | class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> { |
| 76 | public: |
| 77 | VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) |
| 78 | : RegisterRegAllocBase(N, D, C) {} |
| 79 | }; |
| 80 | |
| 81 | static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, |
| 82 | const TargetRegisterClass &RC) { |
| 83 | return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); |
| 84 | } |
| 85 | |
| 86 | static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, |
| 87 | const TargetRegisterClass &RC) { |
| 88 | return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); |
| 89 | } |
| 90 | |
| 91 | |
| 92 | /// -{sgpr|vgpr}-regalloc=... command line option. |
| 93 | static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } |
| 94 | |
| 95 | /// A dummy default pass factory indicates whether the register allocator is |
| 96 | /// overridden on the command line. |
| 97 | static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag; |
| 98 | static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; |
| 99 | |
| 100 | static SGPRRegisterRegAlloc |
| 101 | defaultSGPRRegAlloc("default", |
| 102 | "pick SGPR register allocator based on -O option", |
| 103 | useDefaultRegisterAllocator); |
| 104 | |
| 105 | static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false, |
| 106 | RegisterPassParser<SGPRRegisterRegAlloc>> |
| 107 | SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), |
| 108 | cl::desc("Register allocator to use for SGPRs")); |
| 109 | |
| 110 | static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false, |
| 111 | RegisterPassParser<VGPRRegisterRegAlloc>> |
| 112 | VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), |
| 113 | cl::desc("Register allocator to use for VGPRs")); |
| 114 | |
| 115 | |
| 116 | static void initializeDefaultSGPRRegisterAllocatorOnce() { |
| 117 | RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); |
| 118 | |
| 119 | if (!Ctor) { |
| 120 | Ctor = SGPRRegAlloc; |
|     | Value stored to 'Ctor' is never read |
| 121 | SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc); |
| 122 | } |
| 123 | } |
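The analyzer is flagging that the local 'Ctor' is overwritten inside the 'if (!Ctor)' branch but never read again before the function returns; only the setDefault() call has any effect. Below is a minimal sketch of one way to address the finding, assuming the branch exists solely to install the command-line-selected allocator as the class default (the same pattern appears in initializeDefaultVGPRRegisterAllocatorOnce just after this). This is an illustration, not necessarily the upstream fix.

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    // No default registered yet: install the allocator chosen by
    // -sgpr-regalloc (or the dummy default factory). The local 'Ctor' is
    // deliberately not reassigned, since it is never read past this point.
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}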
| 124 | |
| 125 | static void initializeDefaultVGPRRegisterAllocatorOnce() { |
| 126 | RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); |
| 127 | |
| 128 | if (!Ctor) { |
| 129 | Ctor = VGPRRegAlloc; |
| 130 | VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc); |
| 131 | } |
| 132 | } |
| 133 | |
| 134 | static FunctionPass *createBasicSGPRRegisterAllocator() { |
| 135 | return createBasicRegisterAllocator(onlyAllocateSGPRs); |
| 136 | } |
| 137 | |
| 138 | static FunctionPass *createGreedySGPRRegisterAllocator() { |
| 139 | return createGreedyRegisterAllocator(onlyAllocateSGPRs); |
| 140 | } |
| 141 | |
| 142 | static FunctionPass *createFastSGPRRegisterAllocator() { |
| 143 | return createFastRegisterAllocator(onlyAllocateSGPRs, false); |
| 144 | } |
| 145 | |
| 146 | static FunctionPass *createBasicVGPRRegisterAllocator() { |
| 147 | return createBasicRegisterAllocator(onlyAllocateVGPRs); |
| 148 | } |
| 149 | |
| 150 | static FunctionPass *createGreedyVGPRRegisterAllocator() { |
| 151 | return createGreedyRegisterAllocator(onlyAllocateVGPRs); |
| 152 | } |
| 153 | |
| 154 | static FunctionPass *createFastVGPRRegisterAllocator() { |
| 155 | return createFastRegisterAllocator(onlyAllocateVGPRs, true); |
| 156 | } |
| 157 | |
| 158 | static SGPRRegisterRegAlloc basicRegAllocSGPR( |
| 159 | "basic", "basic register allocator", createBasicSGPRRegisterAllocator); |
| 160 | static SGPRRegisterRegAlloc greedyRegAllocSGPR( |
| 161 | "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator); |
| 162 | |
| 163 | static SGPRRegisterRegAlloc fastRegAllocSGPR( |
| 164 | "fast", "fast register allocator", createFastSGPRRegisterAllocator); |
| 165 | |
| 166 | |
| 167 | static VGPRRegisterRegAlloc basicRegAllocVGPR( |
| 168 | "basic", "basic register allocator", createBasicVGPRRegisterAllocator); |
| 169 | static VGPRRegisterRegAlloc greedyRegAllocVGPR( |
| 170 | "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator); |
| 171 | |
| 172 | static VGPRRegisterRegAlloc fastRegAllocVGPR( |
| 173 | "fast", "fast register allocator", createFastVGPRRegisterAllocator); |
| 174 | } |
| 175 | |
| 176 | static cl::opt<bool> EnableSROA( |
| 177 | "amdgpu-sroa", |
| 178 | cl::desc("Run SROA after promote alloca pass"), |
| 179 | cl::ReallyHidden, |
| 180 | cl::init(true)); |
| 181 | |
| 182 | static cl::opt<bool> |
| 183 | EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, |
| 184 | cl::desc("Run early if-conversion"), |
| 185 | cl::init(false)); |
| 186 | |
| 187 | static cl::opt<bool> |
| 188 | OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, |
| 189 | cl::desc("Run pre-RA exec mask optimizations"), |
| 190 | cl::init(true)); |
| 191 | |
| 192 | // Option to disable vectorizer for tests. |
| 193 | static cl::opt<bool> EnableLoadStoreVectorizer( |
| 194 | "amdgpu-load-store-vectorizer", |
| 195 | cl::desc("Enable load store vectorizer"), |
| 196 | cl::init(true), |
| 197 | cl::Hidden); |
| 198 | |
| 199 | // Option to control global loads scalarization |
| 200 | static cl::opt<bool> ScalarizeGlobal( |
| 201 | "amdgpu-scalarize-global-loads", |
| 202 | cl::desc("Enable global load scalarization"), |
| 203 | cl::init(true), |
| 204 | cl::Hidden); |
| 205 | |
| 206 | // Option to run internalize pass. |
| 207 | static cl::opt<bool> InternalizeSymbols( |
| 208 | "amdgpu-internalize-symbols", |
| 209 | cl::desc("Enable elimination of non-kernel functions and unused globals"), |
| 210 | cl::init(false), |
| 211 | cl::Hidden); |
| 212 | |
| 213 | // Option to inline all early. |
| 214 | static cl::opt<bool> EarlyInlineAll( |
| 215 | "amdgpu-early-inline-all", |
| 216 | cl::desc("Inline all functions early"), |
| 217 | cl::init(false), |
| 218 | cl::Hidden); |
| 219 | |
| 220 | static cl::opt<bool> RemoveIncompatibleFunctions( |
| 221 | "amdgpu-enable-remove-incompatible-functions", cl::Hidden, |
| 222 | cl::desc("Enable removal of functions when they" |
| 223 | "use features not supported by the target GPU"), |
| 224 | cl::init(true)); |
| 225 | |
| 226 | static cl::opt<bool> EnableSDWAPeephole( |
| 227 | "amdgpu-sdwa-peephole", |
| 228 | cl::desc("Enable SDWA peepholer"), |
| 229 | cl::init(true)); |
| 230 | |
| 231 | static cl::opt<bool> EnableDPPCombine( |
| 232 | "amdgpu-dpp-combine", |
| 233 | cl::desc("Enable DPP combiner"), |
| 234 | cl::init(true)); |
| 235 | |
| 236 | // Enable address space based alias analysis |
| 237 | static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, |
| 238 | cl::desc("Enable AMDGPU Alias Analysis"), |
| 239 | cl::init(true)); |
| 240 | |
| 241 | // Option to run late CFG structurizer |
| 242 | static cl::opt<bool, true> LateCFGStructurize( |
| 243 | "amdgpu-late-structurize", |
| 244 | cl::desc("Enable late CFG structurization"), |
| 245 | cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), |
| 246 | cl::Hidden); |
| 247 | |
| 248 | // Enable lib calls simplifications |
| 249 | static cl::opt<bool> EnableLibCallSimplify( |
| 250 | "amdgpu-simplify-libcall", |
| 251 | cl::desc("Enable amdgpu library simplifications"), |
| 252 | cl::init(true), |
| 253 | cl::Hidden); |
| 254 | |
| 255 | static cl::opt<bool> EnableLowerKernelArguments( |
| 256 | "amdgpu-ir-lower-kernel-arguments", |
| 257 | cl::desc("Lower kernel argument loads in IR pass"), |
| 258 | cl::init(true), |
| 259 | cl::Hidden); |
| 260 | |
| 261 | static cl::opt<bool> EnableRegReassign( |
| 262 | "amdgpu-reassign-regs", |
| 263 | cl::desc("Enable register reassign optimizations on gfx10+"), |
| 264 | cl::init(true), |
| 265 | cl::Hidden); |
| 266 | |
| 267 | static cl::opt<bool> OptVGPRLiveRange( |
| 268 | "amdgpu-opt-vgpr-liverange", |
| 269 | cl::desc("Enable VGPR liverange optimizations for if-else structure"), |
| 270 | cl::init(true), cl::Hidden); |
| 271 | |
| 272 | // Enable atomic optimization |
| 273 | static cl::opt<bool> EnableAtomicOptimizations( |
| 274 | "amdgpu-atomic-optimizations", |
| 275 | cl::desc("Enable atomic optimizations"), |
| 276 | cl::init(false), |
| 277 | cl::Hidden); |
| 278 | |
| 279 | // Enable Mode register optimization |
| 280 | static cl::opt<bool> EnableSIModeRegisterPass( |
| 281 | "amdgpu-mode-register", |
| 282 | cl::desc("Enable mode register pass"), |
| 283 | cl::init(true), |
| 284 | cl::Hidden); |
| 285 | |
| 286 | // Enable GFX11+ s_delay_alu insertion |
| 287 | static cl::opt<bool> |
| 288 | EnableInsertDelayAlu("amdgpu-enable-delay-alu", |
| 289 | cl::desc("Enable s_delay_alu insertion"), |
| 290 | cl::init(true), cl::Hidden); |
| 291 | |
| 292 | // Enable GFX11+ VOPD |
| 293 | static cl::opt<bool> |
| 294 | EnableVOPD("amdgpu-enable-vopd", |
| 295 | cl::desc("Enable VOPD, dual issue of VALU in wave32"), |
| 296 | cl::init(true), cl::Hidden); |
| 297 | |
| 298 | // Option is used in lit tests to prevent deadcoding of patterns inspected. |
| 299 | static cl::opt<bool> |
| 300 | EnableDCEInRA("amdgpu-dce-in-ra", |
| 301 | cl::init(true), cl::Hidden, |
| 302 | cl::desc("Enable machine DCE inside regalloc")); |
| 303 | |
| 304 | static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority", |
| 305 | cl::desc("Adjust wave priority"), |
| 306 | cl::init(false), cl::Hidden); |
| 307 | |
| 308 | static cl::opt<bool> EnableScalarIRPasses( |
| 309 | "amdgpu-scalar-ir-passes", |
| 310 | cl::desc("Enable scalar IR passes"), |
| 311 | cl::init(true), |
| 312 | cl::Hidden); |
| 313 | |
| 314 | static cl::opt<bool> EnableStructurizerWorkarounds( |
| 315 | "amdgpu-enable-structurizer-workarounds", |
| 316 | cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), |
| 317 | cl::Hidden); |
| 318 | |
| 319 | static cl::opt<bool, true> EnableLowerModuleLDS( |
| 320 | "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), |
| 321 | cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), |
| 322 | cl::Hidden); |
| 323 | |
| 324 | static cl::opt<bool> EnablePreRAOptimizations( |
| 325 | "amdgpu-enable-pre-ra-optimizations", |
| 326 | cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), |
| 327 | cl::Hidden); |
| 328 | |
| 329 | static cl::opt<bool> EnablePromoteKernelArguments( |
| 330 | "amdgpu-enable-promote-kernel-arguments", |
| 331 | cl::desc("Enable promotion of flat kernel pointer arguments to global"), |
| 332 | cl::Hidden, cl::init(true)); |
| 333 | |
| 334 | static cl::opt<bool> EnableMaxIlpSchedStrategy( |
| 335 | "amdgpu-enable-max-ilp-scheduling-strategy", |
| 336 | cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), |
| 337 | cl::Hidden, cl::init(false)); |
| 338 | |
| 339 | extern "C" LLVM_EXTERNAL_VISIBILITY__attribute__((visibility("default"))) void LLVMInitializeAMDGPUTarget() { |
| 340 | // Register the target |
| 341 | RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); |
| 342 | RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); |
| 343 | |
| 344 | PassRegistry *PR = PassRegistry::getPassRegistry(); |
| 345 | initializeR600ClauseMergePassPass(*PR); |
| 346 | initializeR600ControlFlowFinalizerPass(*PR); |
| 347 | initializeR600PacketizerPass(*PR); |
| 348 | initializeR600ExpandSpecialInstrsPassPass(*PR); |
| 349 | initializeR600VectorRegMergerPass(*PR); |
| 350 | initializeGlobalISel(*PR); |
| 351 | initializeAMDGPUDAGToDAGISelPass(*PR); |
| 352 | initializeGCNDPPCombinePass(*PR); |
| 353 | initializeSILowerI1CopiesPass(*PR); |
| 354 | initializeSILowerSGPRSpillsPass(*PR); |
| 355 | initializeSIFixSGPRCopiesPass(*PR); |
| 356 | initializeSIFixVGPRCopiesPass(*PR); |
| 357 | initializeSIFoldOperandsPass(*PR); |
| 358 | initializeSIPeepholeSDWAPass(*PR); |
| 359 | initializeSIShrinkInstructionsPass(*PR); |
| 360 | initializeSIOptimizeExecMaskingPreRAPass(*PR); |
| 361 | initializeSIOptimizeVGPRLiveRangePass(*PR); |
| 362 | initializeSILoadStoreOptimizerPass(*PR); |
| 363 | initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); |
| 364 | initializeAMDGPUAlwaysInlinePass(*PR); |
| 365 | initializeAMDGPUAttributorPass(*PR); |
| 366 | initializeAMDGPUAnnotateKernelFeaturesPass(*PR); |
| 367 | initializeAMDGPUAnnotateUniformValuesPass(*PR); |
| 368 | initializeAMDGPUArgumentUsageInfoPass(*PR); |
| 369 | initializeAMDGPUAtomicOptimizerPass(*PR); |
| 370 | initializeAMDGPULowerKernelArgumentsPass(*PR); |
| 371 | initializeAMDGPUPromoteKernelArgumentsPass(*PR); |
| 372 | initializeAMDGPULowerKernelAttributesPass(*PR); |
| 373 | initializeAMDGPULowerIntrinsicsPass(*PR); |
| 374 | initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); |
| 375 | initializeAMDGPUPostLegalizerCombinerPass(*PR); |
| 376 | initializeAMDGPUPreLegalizerCombinerPass(*PR); |
| 377 | initializeAMDGPURegBankCombinerPass(*PR); |
| 378 | initializeAMDGPURegBankSelectPass(*PR); |
| 379 | initializeAMDGPUPromoteAllocaPass(*PR); |
| 380 | initializeAMDGPUPromoteAllocaToVectorPass(*PR); |
| 381 | initializeAMDGPUCodeGenPreparePass(*PR); |
| 382 | initializeAMDGPULateCodeGenPreparePass(*PR); |
| 383 | initializeAMDGPUPropagateAttributesEarlyPass(*PR); |
| 384 | initializeAMDGPUPropagateAttributesLatePass(*PR); |
| 385 | initializeAMDGPURemoveIncompatibleFunctionsPass(*PR); |
| 386 | initializeAMDGPULowerModuleLDSPass(*PR); |
| 387 | initializeAMDGPURewriteOutArgumentsPass(*PR); |
| 388 | initializeAMDGPURewriteUndefForPHIPass(*PR); |
| 389 | initializeAMDGPUUnifyMetadataPass(*PR); |
| 390 | initializeSIAnnotateControlFlowPass(*PR); |
| 391 | initializeAMDGPUReleaseVGPRsPass(*PR); |
| 392 | initializeAMDGPUInsertDelayAluPass(*PR); |
| 393 | initializeSIInsertHardClausesPass(*PR); |
| 394 | initializeSIInsertWaitcntsPass(*PR); |
| 395 | initializeSIModeRegisterPass(*PR); |
| 396 | initializeSIWholeQuadModePass(*PR); |
| 397 | initializeSILowerControlFlowPass(*PR); |
| 398 | initializeSIPreEmitPeepholePass(*PR); |
| 399 | initializeSILateBranchLoweringPass(*PR); |
| 400 | initializeSIMemoryLegalizerPass(*PR); |
| 401 | initializeSIOptimizeExecMaskingPass(*PR); |
| 402 | initializeSIPreAllocateWWMRegsPass(*PR); |
| 403 | initializeSIFormMemoryClausesPass(*PR); |
| 404 | initializeSIPostRABundlerPass(*PR); |
| 405 | initializeGCNCreateVOPDPass(*PR); |
| 406 | initializeAMDGPUUnifyDivergentExitNodesPass(*PR); |
| 407 | initializeAMDGPUAAWrapperPassPass(*PR); |
| 408 | initializeAMDGPUExternalAAWrapperPass(*PR); |
| 409 | initializeAMDGPUUseNativeCallsPass(*PR); |
| 410 | initializeAMDGPUSimplifyLibCallsPass(*PR); |
| 411 | initializeAMDGPUPrintfRuntimeBindingPass(*PR); |
| 412 | initializeAMDGPUResourceUsageAnalysisPass(*PR); |
| 413 | initializeGCNNSAReassignPass(*PR); |
| 414 | initializeGCNPreRAOptimizationsPass(*PR); |
| 415 | } |
| 416 | |
| 417 | static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { |
| 418 | return std::make_unique<AMDGPUTargetObjectFile>(); |
| 419 | } |
| 420 | |
| 421 | static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { |
| 422 | return new SIScheduleDAGMI(C); |
| 423 | } |
| 424 | |
| 425 | static ScheduleDAGInstrs * |
| 426 | createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { |
| 427 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
| 428 | ScheduleDAGMILive *DAG = |
| 429 | new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C)); |
| 430 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 431 | if (ST.shouldClusterStores()) |
| 432 | DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 433 | DAG->addMutation(createIGroupLPDAGMutation()); |
| 434 | DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); |
| 435 | DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); |
| 436 | return DAG; |
| 437 | } |
| 438 | |
| 439 | static ScheduleDAGInstrs * |
| 440 | createGCNMaxILPMachineScheduler(MachineSchedContext *C) { |
| 441 | ScheduleDAGMILive *DAG = |
| 442 | new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C)); |
| 443 | DAG->addMutation(createIGroupLPDAGMutation()); |
| 444 | return DAG; |
| 445 | } |
| 446 | |
| 447 | static ScheduleDAGInstrs * |
| 448 | createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { |
| 449 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
| 450 | auto DAG = new GCNIterativeScheduler(C, |
| 451 | GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); |
| 452 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 453 | if (ST.shouldClusterStores()) |
| 454 | DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 455 | return DAG; |
| 456 | } |
| 457 | |
| 458 | static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { |
| 459 | return new GCNIterativeScheduler(C, |
| 460 | GCNIterativeScheduler::SCHEDULE_MINREGFORCED); |
| 461 | } |
| 462 | |
| 463 | static ScheduleDAGInstrs * |
| 464 | createIterativeILPMachineScheduler(MachineSchedContext *C) { |
| 465 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
| 466 | auto DAG = new GCNIterativeScheduler(C, |
| 467 | GCNIterativeScheduler::SCHEDULE_ILP); |
| 468 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 469 | if (ST.shouldClusterStores()) |
| 470 | DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 471 | DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); |
| 472 | return DAG; |
| 473 | } |
| 474 | |
| 475 | static MachineSchedRegistry |
| 476 | SISchedRegistry("si", "Run SI's custom scheduler", |
| 477 | createSIMachineScheduler); |
| 478 | |
| 479 | static MachineSchedRegistry |
| 480 | GCNMaxOccupancySchedRegistry("gcn-max-occupancy", |
| 481 | "Run GCN scheduler to maximize occupancy", |
| 482 | createGCNMaxOccupancyMachineScheduler); |
| 483 | |
| 484 | static MachineSchedRegistry |
| 485 | GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp", |
| 486 | createGCNMaxILPMachineScheduler); |
| 487 | |
| 488 | static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry( |
| 489 | "gcn-iterative-max-occupancy-experimental", |
| 490 | "Run GCN scheduler to maximize occupancy (experimental)", |
| 491 | createIterativeGCNMaxOccupancyMachineScheduler); |
| 492 | |
| 493 | static MachineSchedRegistry GCNMinRegSchedRegistry( |
| 494 | "gcn-iterative-minreg", |
| 495 | "Run GCN iterative scheduler for minimal register usage (experimental)", |
| 496 | createMinRegScheduler); |
| 497 | |
| 498 | static MachineSchedRegistry GCNILPSchedRegistry( |
| 499 | "gcn-iterative-ilp", |
| 500 | "Run GCN iterative scheduler for ILP scheduling (experimental)", |
| 501 | createIterativeILPMachineScheduler); |
| 502 | |
| 503 | static StringRef computeDataLayout(const Triple &TT) { |
| 504 | if (TT.getArch() == Triple::r600) { |
| 505 | // 32-bit pointers. |
| 506 | return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" |
| 507 | "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; |
| 508 | } |
| 509 | |
| 510 | // 32-bit private, local, and region pointers. 64-bit global, constant and |
| 511 | // flat. 160-bit non-integral fat buffer pointers that include a 128-bit |
| 512 | // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values |
| 513 | // (address space 7), and 128-bit non-integral buffer resources (address |
| 514 | // space 8) which cannot be non-trivially accessed by LLVM memory operations |
| 515 | // like getelementptr. |
| 516 | return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" |
| 517 | "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:" |
| 518 | "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-" |
| 519 | "G1-ni:7:8"; |
| 520 | } |
| 521 | |
| 522 | LLVM_READNONE |
| 523 | static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { |
| 524 | if (!GPU.empty()) |
| 525 | return GPU; |
| 526 | |
| 527 | // Need to default to a target with flat support for HSA. |
| 528 | if (TT.getArch() == Triple::amdgcn) |
| 529 | return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic"; |
| 530 | |
| 531 | return "r600"; |
| 532 | } |
| 533 | |
| 534 | static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { |
| 535 | // The AMDGPU toolchain only supports generating shared objects, so we |
| 536 | // must always use PIC. |
| 537 | return Reloc::PIC_; |
| 538 | } |
| 539 | |
| 540 | AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, |
| 541 | StringRef CPU, StringRef FS, |
| 542 | TargetOptions Options, |
| 543 | std::optional<Reloc::Model> RM, |
| 544 | std::optional<CodeModel::Model> CM, |
| 545 | CodeGenOpt::Level OptLevel) |
| 546 | : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), |
| 547 | FS, Options, getEffectiveRelocModel(RM), |
| 548 | getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), |
| 549 | TLOF(createTLOF(getTargetTriple())) { |
| 550 | initAsmInfo(); |
| 551 | if (TT.getArch() == Triple::amdgcn) { |
| 552 | if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64")) |
| 553 | MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64)); |
| 554 | else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) |
| 555 | MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); |
| 556 | } |
| 557 | } |
| 558 | |
| 559 | bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; |
| 560 | bool AMDGPUTargetMachine::EnableFunctionCalls = false; |
| 561 | bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; |
| 562 | |
| 563 | AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; |
| 564 | |
| 565 | StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { |
| 566 | Attribute GPUAttr = F.getFnAttribute("target-cpu"); |
| 567 | return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU(); |
| 568 | } |
| 569 | |
| 570 | StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { |
| 571 | Attribute FSAttr = F.getFnAttribute("target-features"); |
| 572 | |
| 573 | return FSAttr.isValid() ? FSAttr.getValueAsString() |
| 574 | : getTargetFeatureString(); |
| 575 | } |
| 576 | |
| 577 | /// Predicate for Internalize pass. |
| 578 | static bool mustPreserveGV(const GlobalValue &GV) { |
| 579 | if (const Function *F = dyn_cast<Function>(&GV)) |
| 580 | return F->isDeclaration() || F->getName().startswith("__asan_") || |
| 581 | F->getName().startswith("__sanitizer_") || |
| 582 | AMDGPU::isEntryFunctionCC(F->getCallingConv()); |
| 583 | |
| 584 | GV.removeDeadConstantUsers(); |
| 585 | return !GV.use_empty(); |
| 586 | } |
| 587 | |
| 588 | void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { |
| 589 | AAM.registerFunctionAnalysis<AMDGPUAA>(); |
| 590 | } |
| 591 | |
| 592 | void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { |
| 593 | PB.registerPipelineParsingCallback( |
| 594 | [this](StringRef PassName, ModulePassManager &PM, |
| 595 | ArrayRef<PassBuilder::PipelineElement>) { |
| 596 | if (PassName == "amdgpu-propagate-attributes-late") { |
| 597 | PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); |
| 598 | return true; |
| 599 | } |
| 600 | if (PassName == "amdgpu-unify-metadata") { |
| 601 | PM.addPass(AMDGPUUnifyMetadataPass()); |
| 602 | return true; |
| 603 | } |
| 604 | if (PassName == "amdgpu-printf-runtime-binding") { |
| 605 | PM.addPass(AMDGPUPrintfRuntimeBindingPass()); |
| 606 | return true; |
| 607 | } |
| 608 | if (PassName == "amdgpu-always-inline") { |
| 609 | PM.addPass(AMDGPUAlwaysInlinePass()); |
| 610 | return true; |
| 611 | } |
| 612 | if (PassName == "amdgpu-lower-module-lds") { |
| 613 | PM.addPass(AMDGPULowerModuleLDSPass()); |
| 614 | return true; |
| 615 | } |
| 616 | if (PassName == "amdgpu-lower-ctor-dtor") { |
| 617 | PM.addPass(AMDGPUCtorDtorLoweringPass()); |
| 618 | return true; |
| 619 | } |
| 620 | return false; |
| 621 | }); |
| 622 | PB.registerPipelineParsingCallback( |
| 623 | [this](StringRef PassName, FunctionPassManager &PM, |
| 624 | ArrayRef<PassBuilder::PipelineElement>) { |
| 625 | if (PassName == "amdgpu-simplifylib") { |
| 626 | PM.addPass(AMDGPUSimplifyLibCallsPass(*this)); |
| 627 | return true; |
| 628 | } |
| 629 | if (PassName == "amdgpu-usenative") { |
| 630 | PM.addPass(AMDGPUUseNativeCallsPass()); |
| 631 | return true; |
| 632 | } |
| 633 | if (PassName == "amdgpu-promote-alloca") { |
| 634 | PM.addPass(AMDGPUPromoteAllocaPass(*this)); |
| 635 | return true; |
| 636 | } |
| 637 | if (PassName == "amdgpu-promote-alloca-to-vector") { |
| 638 | PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); |
| 639 | return true; |
| 640 | } |
| 641 | if (PassName == "amdgpu-lower-kernel-attributes") { |
| 642 | PM.addPass(AMDGPULowerKernelAttributesPass()); |
| 643 | return true; |
| 644 | } |
| 645 | if (PassName == "amdgpu-propagate-attributes-early") { |
| 646 | PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); |
| 647 | return true; |
| 648 | } |
| 649 | if (PassName == "amdgpu-promote-kernel-arguments") { |
| 650 | PM.addPass(AMDGPUPromoteKernelArgumentsPass()); |
| 651 | return true; |
| 652 | } |
| 653 | if (PassName == "amdgpu-unify-divergent-exit-nodes") { |
| 654 | PM.addPass(AMDGPUUnifyDivergentExitNodesPass()); |
| 655 | return true; |
| 656 | } |
| 657 | if (PassName == "amdgpu-atomic-optimizer") { |
| 658 | PM.addPass(AMDGPUAtomicOptimizerPass(*this)); |
| 659 | return true; |
| 660 | } |
| 661 | return false; |
| 662 | }); |
| 663 | |
| 664 | PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) { |
| 665 | FAM.registerPass([&] { return AMDGPUAA(); }); |
| 666 | }); |
| 667 | |
| 668 | PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) { |
| 669 | if (AAName == "amdgpu-aa") { |
| 670 | AAM.registerFunctionAnalysis<AMDGPUAA>(); |
| 671 | return true; |
| 672 | } |
| 673 | return false; |
| 674 | }); |
| 675 | |
| 676 | PB.registerPipelineStartEPCallback( |
| 677 | [this](ModulePassManager &PM, OptimizationLevel Level) { |
| 678 | FunctionPassManager FPM; |
| 679 | FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); |
| 680 | FPM.addPass(AMDGPUUseNativeCallsPass()); |
| 681 | if (EnableLibCallSimplify && Level != OptimizationLevel::O0) |
| 682 | FPM.addPass(AMDGPUSimplifyLibCallsPass(*this)); |
| 683 | PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); |
| 684 | }); |
| 685 | |
| 686 | PB.registerPipelineEarlySimplificationEPCallback( |
| 687 | [this](ModulePassManager &PM, OptimizationLevel Level) { |
| 688 | PM.addPass(AMDGPUPrintfRuntimeBindingPass()); |
| 689 | |
| 690 | if (Level == OptimizationLevel::O0) |
| 691 | return; |
| 692 | |
| 693 | PM.addPass(AMDGPUUnifyMetadataPass()); |
| 694 | |
| 695 | if (InternalizeSymbols) { |
| 696 | PM.addPass(InternalizePass(mustPreserveGV)); |
| 697 | } |
| 698 | PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); |
| 699 | if (InternalizeSymbols) { |
| 700 | PM.addPass(GlobalDCEPass()); |
| 701 | } |
| 702 | if (EarlyInlineAll && !EnableFunctionCalls) |
| 703 | PM.addPass(AMDGPUAlwaysInlinePass()); |
| 704 | }); |
| 705 | |
| 706 | PB.registerCGSCCOptimizerLateEPCallback( |
| 707 | [this](CGSCCPassManager &PM, OptimizationLevel Level) { |
| 708 | if (Level == OptimizationLevel::O0) |
| 709 | return; |
| 710 | |
| 711 | FunctionPassManager FPM; |
| 712 | |
| 713 | // Add promote kernel arguments pass to the opt pipeline right before |
| 714 | // infer address spaces which is needed to do actual address space |
| 715 | // rewriting. |
| 716 | if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() && |
| 717 | EnablePromoteKernelArguments) |
| 718 | FPM.addPass(AMDGPUPromoteKernelArgumentsPass()); |
| 719 | |
| 720 | // Add infer address spaces pass to the opt pipeline after inlining |
| 721 | // but before SROA to increase SROA opportunities. |
| 722 | FPM.addPass(InferAddressSpacesPass()); |
| 723 | |
| 724 | // This should run after inlining to have any chance of doing |
| 725 | // anything, and before other cleanup optimizations. |
| 726 | FPM.addPass(AMDGPULowerKernelAttributesPass()); |
| 727 | |
| 728 | if (Level != OptimizationLevel::O0) { |
| 729 | // Promote alloca to vector before SROA and loop unroll. If we |
| 730 | // manage to eliminate allocas before unroll we may choose to unroll |
| 731 | // less. |
| 732 | FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); |
| 733 | } |
| 734 | |
| 735 | PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); |
| 736 | }); |
| 737 | } |
| 738 | |
| 739 | int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { |
| 740 | return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
| 741 | AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || |
| 742 | AddrSpace == AMDGPUAS::REGION_ADDRESS) |
| 743 | ? -1 |
| 744 | : 0; |
| 745 | } |
| 746 | |
| 747 | bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, |
| 748 | unsigned DestAS) const { |
| 749 | return AMDGPU::isFlatGlobalAddrSpace(SrcAS) && |
| 750 | AMDGPU::isFlatGlobalAddrSpace(DestAS); |
| 751 | } |
| 752 | |
| 753 | unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { |
| 754 | const auto *LD = dyn_cast<LoadInst>(V); |
| 755 | if (!LD) |
| 756 | return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; |
| 757 | |
| 758 | // It must be a generic pointer loaded. |
| 759 | assert(V->getType()->isPointerTy() &&(static_cast <bool> (V->getType()->isPointerTy() && V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS ) ? void (0) : __assert_fail ("V->getType()->isPointerTy() && V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS" , "llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp", 760, __extension__ __PRETTY_FUNCTION__)) |
| 760 | V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS)(static_cast <bool> (V->getType()->isPointerTy() && V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS ) ? void (0) : __assert_fail ("V->getType()->isPointerTy() && V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS" , "llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp", 760, __extension__ __PRETTY_FUNCTION__)); |
| 761 | |
| 762 | const auto *Ptr = LD->getPointerOperand(); |
| 763 | if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) |
| 764 | return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; |
| 765 | // For a generic pointer loaded from the constant memory, it could be assumed |
| 766 | // as a global pointer since the constant memory is only populated on the |
| 767 | // host side. As implied by the offload programming model, only global |
| 768 | // pointers could be referenced on the host side. |
| 769 | return AMDGPUAS::GLOBAL_ADDRESS; |
| 770 | } |
| 771 | |
| 772 | std::pair<const Value *, unsigned> |
| 773 | AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const { |
| 774 | if (auto *II = dyn_cast<IntrinsicInst>(V)) { |
| 775 | switch (II->getIntrinsicID()) { |
| 776 | case Intrinsic::amdgcn_is_shared: |
| 777 | return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS); |
| 778 | case Intrinsic::amdgcn_is_private: |
| 779 | return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS); |
| 780 | default: |
| 781 | break; |
| 782 | } |
| 783 | return std::pair(nullptr, -1); |
| 784 | } |
| 785 | // Check the global pointer predication based on |
| 786 | // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative and |
| 787 | // the order of 'is_shared' and 'is_private' is not significant. |
| 788 | Value *Ptr; |
| 789 | if (match( |
| 790 | const_cast<Value *>(V), |
| 791 | m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))), |
| 792 | m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>( |
| 793 | m_Deferred(Ptr)))))) |
| 794 | return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS); |
| 795 | |
| 796 | return std::pair(nullptr, -1); |
| 797 | } |
| 798 | |
| 799 | unsigned |
| 800 | AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { |
| 801 | switch (Kind) { |
| 802 | case PseudoSourceValue::Stack: |
| 803 | case PseudoSourceValue::FixedStack: |
| 804 | return AMDGPUAS::PRIVATE_ADDRESS; |
| 805 | case PseudoSourceValue::ConstantPool: |
| 806 | case PseudoSourceValue::GOT: |
| 807 | case PseudoSourceValue::JumpTable: |
| 808 | case PseudoSourceValue::GlobalValueCallEntry: |
| 809 | case PseudoSourceValue::ExternalSymbolCallEntry: |
| 810 | return AMDGPUAS::CONSTANT_ADDRESS; |
| 811 | } |
| 812 | return AMDGPUAS::FLAT_ADDRESS; |
| 813 | } |
| 814 | |
| 815 | //===----------------------------------------------------------------------===// |
| 816 | // GCN Target Machine (SI+) |
| 817 | //===----------------------------------------------------------------------===// |
| 818 | |
| 819 | GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, |
| 820 | StringRef CPU, StringRef FS, |
| 821 | TargetOptions Options, |
| 822 | std::optional<Reloc::Model> RM, |
| 823 | std::optional<CodeModel::Model> CM, |
| 824 | CodeGenOpt::Level OL, bool JIT) |
| 825 | : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} |
| 826 | |
| 827 | const TargetSubtargetInfo * |
| 828 | GCNTargetMachine::getSubtargetImpl(const Function &F) const { |
| 829 | StringRef GPU = getGPUName(F); |
| 830 | StringRef FS = getFeatureString(F); |
| 831 | |
| 832 | SmallString<128> SubtargetKey(GPU); |
| 833 | SubtargetKey.append(FS); |
| 834 | |
| 835 | auto &I = SubtargetMap[SubtargetKey]; |
| 836 | if (!I) { |
| 837 | // This needs to be done before we create a new subtarget since any |
| 838 | // creation will depend on the TM and the code generation flags on the |
| 839 | // function that reside in TargetOptions. |
| 840 | resetTargetOptions(F); |
| 841 | I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); |
| 842 | } |
| 843 | |
| 844 | I->setScalarizeGlobalBehavior(ScalarizeGlobal); |
| 845 | |
| 846 | return I.get(); |
| 847 | } |
| 848 | |
| 849 | TargetTransformInfo |
| 850 | GCNTargetMachine::getTargetTransformInfo(const Function &F) const { |
| 851 | return TargetTransformInfo(GCNTTIImpl(this, F)); |
| 852 | } |
| 853 | |
| 854 | //===----------------------------------------------------------------------===// |
| 855 | // AMDGPU Pass Setup |
| 856 | //===----------------------------------------------------------------------===// |
| 857 | |
| 858 | std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const { |
| 859 | return getStandardCSEConfigForOpt(TM->getOptLevel()); |
| 860 | } |
| 861 | |
| 862 | namespace { |
| 863 | |
| 864 | class GCNPassConfig final : public AMDGPUPassConfig { |
| 865 | public: |
| 866 | GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) |
| 867 | : AMDGPUPassConfig(TM, PM) { |
| 868 | // It is necessary to know the register usage of the entire call graph. We |
| 869 | // allow calls without EnableAMDGPUFunctionCalls if they are marked |
| 870 | // noinline, so this is always required. |
| 871 | setRequiresCodeGenSCCOrder(true); |
| 872 | substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); |
| 873 | } |
| 874 | |
| 875 | GCNTargetMachine &getGCNTargetMachine() const { |
| 876 | return getTM<GCNTargetMachine>(); |
| 877 | } |
| 878 | |
| 879 | ScheduleDAGInstrs * |
| 880 | createMachineScheduler(MachineSchedContext *C) const override; |
| 881 | |
| 882 | ScheduleDAGInstrs * |
| 883 | createPostMachineScheduler(MachineSchedContext *C) const override { |
| 884 | ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive( |
| 885 | C, std::make_unique<PostGenericScheduler>(C), |
| 886 | /*RemoveKillFlags=*/true); |
| 887 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
| 888 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 889 | if (ST.shouldClusterStores()) |
| 890 | DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 891 | DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); |
| 892 | DAG->addMutation(createIGroupLPDAGMutation()); |
| 893 | if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) |
| 894 | DAG->addMutation(createVOPDPairingMutation()); |
| 895 | return DAG; |
| 896 | } |
| 897 | |
| 898 | bool addPreISel() override; |
| 899 | void addMachineSSAOptimization() override; |
| 900 | bool addILPOpts() override; |
| 901 | bool addInstSelector() override; |
| 902 | bool addIRTranslator() override; |
| 903 | void addPreLegalizeMachineIR() override; |
| 904 | bool addLegalizeMachineIR() override; |
| 905 | void addPreRegBankSelect() override; |
| 906 | bool addRegBankSelect() override; |
| 907 | void addPreGlobalInstructionSelect() override; |
| 908 | bool addGlobalInstructionSelect() override; |
| 909 | void addFastRegAlloc() override; |
| 910 | void addOptimizedRegAlloc() override; |
| 911 | |
| 912 | FunctionPass *createSGPRAllocPass(bool Optimized); |
| 913 | FunctionPass *createVGPRAllocPass(bool Optimized); |
| 914 | FunctionPass *createRegAllocPass(bool Optimized) override; |
| 915 | |
| 916 | bool addRegAssignAndRewriteFast() override; |
| 917 | bool addRegAssignAndRewriteOptimized() override; |
| 918 | |
| 919 | void addPreRegAlloc() override; |
| 920 | bool addPreRewrite() override; |
| 921 | void addPostRegAlloc() override; |
| 922 | void addPreSched2() override; |
| 923 | void addPreEmitPass() override; |
| 924 | }; |
| 925 | |
| 926 | } // end anonymous namespace |
| 927 | |
| 928 | AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) |
| 929 | : TargetPassConfig(TM, PM) { |
| 930 | // Exceptions and StackMaps are not supported, so these passes will never do |
| 931 | // anything. |
| 932 | disablePass(&StackMapLivenessID); |
| 933 | disablePass(&FuncletLayoutID); |
| 934 | // Garbage collection is not supported. |
| 935 | disablePass(&GCLoweringID); |
| 936 | disablePass(&ShadowStackGCLoweringID); |
| 937 | } |
| 938 | |
| 939 | void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { |
| 940 | if (getOptLevel() == CodeGenOpt::Aggressive) |
| 941 | addPass(createGVNPass()); |
| 942 | else |
| 943 | addPass(createEarlyCSEPass()); |
| 944 | } |
| 945 | |
| 946 | void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { |
| 947 | addPass(createLICMPass()); |
| 948 | addPass(createSeparateConstOffsetFromGEPPass()); |
| 949 | // ReassociateGEPs exposes more opportunities for SLSR. See |
| 950 | // the example in reassociate-geps-and-slsr.ll. |
| 951 | addPass(createStraightLineStrengthReducePass()); |
| 952 | // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or |
| 953 | // EarlyCSE can reuse. |
| 954 | addEarlyCSEOrGVNPass(); |
| 955 | // Run NaryReassociate after EarlyCSE/GVN to be more effective. |
| 956 | addPass(createNaryReassociatePass()); |
| 957 | // NaryReassociate on GEPs creates redundant common expressions, so run |
| 958 | // EarlyCSE after it. |
| 959 | addPass(createEarlyCSEPass()); |
| 960 | } |
| 961 | |
| 962 | void AMDGPUPassConfig::addIRPasses() { |
| 963 | const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); |
| 964 | |
| 965 | // There is no reason to run these. |
| 966 | disablePass(&StackMapLivenessID); |
| 967 | disablePass(&FuncletLayoutID); |
| 968 | disablePass(&PatchableFunctionID); |
| 969 | |
| 970 | addPass(createAMDGPUPrintfRuntimeBinding()); |
| 971 | addPass(createAMDGPUCtorDtorLoweringLegacyPass()); |
| 972 | |
| 973 | // Run the propagate attributes pass in the backend in case opt was not run. |
| 974 | addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); |
| 975 | |
| 976 | addPass(createAMDGPULowerIntrinsicsPass()); |
| 977 | |
| 978 | // Function calls are not supported, so make sure we inline everything. |
| 979 | addPass(createAMDGPUAlwaysInlinePass()); |
| 980 | addPass(createAlwaysInlinerLegacyPass()); |
| 981 | |
| 982 | // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. |
| 983 | if (TM.getTargetTriple().getArch() == Triple::r600) |
| 984 | addPass(createR600OpenCLImageTypeLoweringPass()); |
| 985 | |
| 986 | // Replace OpenCL enqueued block function pointers with global variables. |
| 987 | addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); |
| 988 | |
| 989 | // Runs before PromoteAlloca so the latter can account for function uses |
| 990 | if (EnableLowerModuleLDS) { |
| 991 | addPass(createAMDGPULowerModuleLDSPass()); |
| 992 | } |
| 993 | |
| 994 | if (TM.getOptLevel() > CodeGenOpt::None) |
| 995 | addPass(createInferAddressSpacesPass()); |
| 996 | |
| 997 | addPass(createAtomicExpandPass()); |
| 998 | |
| 999 | if (TM.getOptLevel() > CodeGenOpt::None) { |
| 1000 | addPass(createAMDGPUPromoteAlloca()); |
| 1001 | |
| 1002 | if (EnableSROA) |
| 1003 | addPass(createSROAPass()); |
| 1004 | if (isPassEnabled(EnableScalarIRPasses)) |
| 1005 | addStraightLineScalarOptimizationPasses(); |
| 1006 | |
| 1007 | if (EnableAMDGPUAliasAnalysis) { |
| 1008 | addPass(createAMDGPUAAWrapperPass()); |
| 1009 | addPass(createExternalAAWrapperPass([](Pass &P, Function &, |
| 1010 | AAResults &AAR) { |
| 1011 | if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) |
| 1012 | AAR.addAAResult(WrapperPass->getResult()); |
| 1013 | })); |
| 1014 | } |
| 1015 | |
| 1016 | if (TM.getTargetTriple().getArch() == Triple::amdgcn) { |
| 1017 | // TODO: May want to move later or split into an early and late one. |
| 1018 | addPass(createAMDGPUCodeGenPreparePass()); |
| 1019 | } |
| 1020 | } |
| 1021 | |
| 1022 | TargetPassConfig::addIRPasses(); |
| 1023 | |
| 1024 | // EarlyCSE is not always strong enough to clean up what LSR produces. For |
| 1025 | // example, GVN can combine |
| 1026 | // |
| 1027 | // %0 = add %a, %b |
| 1028 | // %1 = add %b, %a |
| 1029 | // |
| 1030 | // and |
| 1031 | // |
| 1032 | // %0 = shl nsw %a, 2 |
| 1033 | // %1 = shl %a, 2 |
| 1034 | // |
| 1035 | // but EarlyCSE can do neither of them. |
| 1036 | if (isPassEnabled(EnableScalarIRPasses)) |
| 1037 | addEarlyCSEOrGVNPass(); |
| 1038 | } |
| 1039 | |
| 1040 | void AMDGPUPassConfig::addCodeGenPrepare() { |
| 1041 | if (TM->getTargetTriple().getArch() == Triple::amdgcn) { |
| 1042 | if (RemoveIncompatibleFunctions) |
| 1043 | addPass(createAMDGPURemoveIncompatibleFunctionsPass(TM)); |
| 1044 | |
| 1045 | addPass(createAMDGPUAttributorPass()); |
| 1046 | |
| 1047 | // FIXME: This pass adds 2 hacky attributes that can be replaced with an |
| 1048 | // analysis, and should be removed. |
| 1049 | addPass(createAMDGPUAnnotateKernelFeaturesPass()); |
| 1050 | } |
| 1051 | |
| 1052 | if (TM->getTargetTriple().getArch() == Triple::amdgcn && |
| 1053 | EnableLowerKernelArguments) |
| 1054 | addPass(createAMDGPULowerKernelArgumentsPass()); |
| 1055 | |
| 1056 | TargetPassConfig::addCodeGenPrepare(); |
| 1057 | |
| 1058 | if (isPassEnabled(EnableLoadStoreVectorizer)) |
| 1059 | addPass(createLoadStoreVectorizerPass()); |
| 1060 | |
| 1061 | // LowerSwitch pass may introduce unreachable blocks that can |
| 1062 | // cause unexpected behavior for subsequent passes. Placing it |
| 1063 | // here seems better, as these blocks would get cleaned up by |
| 1064 | // UnreachableBlockElim inserted next in the pass flow. |
| 1065 | addPass(createLowerSwitchPass()); |
| 1066 | } |
| 1067 | |
| 1068 | bool AMDGPUPassConfig::addPreISel() { |
| 1069 | if (TM->getOptLevel() > CodeGenOpt::None) |
| 1070 | addPass(createFlattenCFGPass()); |
| 1071 | return false; |
| 1072 | } |
| 1073 | |
| 1074 | bool AMDGPUPassConfig::addInstSelector() { |
| 1075 | addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel())); |
| 1076 | return false; |
| 1077 | } |
| 1078 | |
| 1079 | bool AMDGPUPassConfig::addGCPasses() { |
| 1080 | // Do nothing. GC is not supported. |
| 1081 | return false; |
| 1082 | } |
| 1083 | |
| 1084 | llvm::ScheduleDAGInstrs * |
| 1085 | AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const { |
| 1086 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
| 1087 | ScheduleDAGMILive *DAG = createGenericSchedLive(C); |
| 1088 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 1089 | if (ST.shouldClusterStores()) |
| 1090 | DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 1091 | return DAG; |
| 1092 | } |
| 1093 | |
| 1094 | MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo( |
| 1095 | BumpPtrAllocator &Allocator, const Function &F, |
| 1096 | const TargetSubtargetInfo *STI) const { |
| 1097 | return R600MachineFunctionInfo::create<R600MachineFunctionInfo>( |
| 1098 | Allocator, F, static_cast<const R600Subtarget *>(STI)); |
| 1099 | } |
| 1100 | |
| 1101 | //===----------------------------------------------------------------------===// |
| 1102 | // GCN Pass Setup |
| 1103 | //===----------------------------------------------------------------------===// |
| 1104 | |
| 1105 | ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( |
| 1106 | MachineSchedContext *C) const { |
| 1107 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
| 1108 | if (ST.enableSIScheduler()) |
| 1109 | return createSIMachineScheduler(C); |
| 1110 | |
| 1111 | if (EnableMaxIlpSchedStrategy) |
| 1112 | return createGCNMaxILPMachineScheduler(C); |
| 1113 | |
| 1114 | return createGCNMaxOccupancyMachineScheduler(C); |
| 1115 | } |
| 1116 | |
| 1117 | bool GCNPassConfig::addPreISel() { |
| 1118 | AMDGPUPassConfig::addPreISel(); |
| 1119 | |
| 1120 | if (TM->getOptLevel() > CodeGenOpt::None) |
| 1121 | addPass(createAMDGPULateCodeGenPreparePass()); |
| 1122 | |
| 1123 | if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) { |
| 1124 | addPass(createAMDGPUAtomicOptimizerPass()); |
| 1125 | } |
| 1126 | |
| 1127 | if (TM->getOptLevel() > CodeGenOpt::None) |
| 1128 | addPass(createSinkingPass()); |
| 1129 | |
| 1130 | // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit |
| 1131 | // regions formed by them. |
| 1132 | addPass(&AMDGPUUnifyDivergentExitNodesID); |
| 1133 | if (!LateCFGStructurize) { |
| 1134 | if (EnableStructurizerWorkarounds) { |
| 1135 | addPass(createFixIrreduciblePass()); |
| 1136 | addPass(createUnifyLoopExitsPass()); |
| 1137 | } |
| 1138 | addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions |
| 1139 | } |
| 1140 | addPass(createAMDGPUAnnotateUniformValues()); |
| 1141 | if (!LateCFGStructurize) { |
| 1142 | addPass(createSIAnnotateControlFlowPass()); |
| 1143 | // TODO: Move this right after structurizeCFG to avoid extra divergence |
| 1144 | // analysis. This depends on stopping SIAnnotateControlFlow from making |
| 1145 | // control flow modifications. |
| 1146 | addPass(createAMDGPURewriteUndefForPHIPass()); |
| 1147 | } |
| 1148 | addPass(createLCSSAPass()); |
| 1149 | |
| 1150 | if (TM->getOptLevel() > CodeGenOpt::Less) |
| 1151 | addPass(&AMDGPUPerfHintAnalysisID); |
| 1152 | |
| 1153 | return false; |
| 1154 | } |
| 1155 | |
| 1156 | void GCNPassConfig::addMachineSSAOptimization() { |
| 1157 | TargetPassConfig::addMachineSSAOptimization(); |
| 1158 | |
| 1159 | // We want to fold operands after PeepholeOptimizer has run (or as part of |
| 1160 | // it), because it will eliminate extra copies making it easier to fold the |
| 1161 | // real source operand. We want to eliminate dead instructions after, so that |
| 1162 | // we see fewer uses of the copies. We then need to clean up the dead |
| 1163 | // instructions leftover after the operands are folded as well. |
| 1164 | // |
| 1165 | // XXX - Can we get away without running DeadMachineInstructionElim again? |
| 1166 | addPass(&SIFoldOperandsID); |
| 1167 | if (EnableDPPCombine) |
| 1168 | addPass(&GCNDPPCombineID); |
| 1169 | addPass(&SILoadStoreOptimizerID); |
| 1170 | if (isPassEnabled(EnableSDWAPeephole)) { |
| 1171 | addPass(&SIPeepholeSDWAID); |
| 1172 | addPass(&EarlyMachineLICMID); |
| 1173 | addPass(&MachineCSEID); |
| 1174 | addPass(&SIFoldOperandsID); |
| 1175 | } |
| 1176 | addPass(&DeadMachineInstructionElimID); |
| 1177 | addPass(createSIShrinkInstructionsPass()); |
| 1178 | } |
| 1179 | |
| 1180 | bool GCNPassConfig::addILPOpts() { |
| 1181 | if (EnableEarlyIfConversion) |
| 1182 | addPass(&EarlyIfConverterID); |
| 1183 | |
| 1184 | TargetPassConfig::addILPOpts(); |
| 1185 | return false; |
| 1186 | } |
| 1187 | |
| 1188 | bool GCNPassConfig::addInstSelector() { |
| 1189 | AMDGPUPassConfig::addInstSelector(); |
| 1190 | addPass(&SIFixSGPRCopiesID); |
| 1191 | addPass(createSILowerI1CopiesPass()); |
| 1192 | return false; |
| 1193 | } |
| 1194 | |
| 1195 | bool GCNPassConfig::addIRTranslator() { |
| 1196 | addPass(new IRTranslator(getOptLevel())); |
| 1197 | return false; |
| 1198 | } |
| 1199 | |
| 1200 | void GCNPassConfig::addPreLegalizeMachineIR() { |
| 1201 | bool IsOptNone = getOptLevel() == CodeGenOpt::None; |
| 1202 | addPass(createAMDGPUPreLegalizeCombiner(IsOptNone)); |
| 1203 | addPass(new Localizer()); |
| 1204 | } |
| 1205 | |
| 1206 | bool GCNPassConfig::addLegalizeMachineIR() { |
| 1207 | addPass(new Legalizer()); |
| 1208 | return false; |
| 1209 | } |
| 1210 | |
| 1211 | void GCNPassConfig::addPreRegBankSelect() { |
| 1212 | bool IsOptNone = getOptLevel() == CodeGenOpt::None; |
| 1213 | addPass(createAMDGPUPostLegalizeCombiner(IsOptNone)); |
| 1214 | } |
| 1215 | |
| 1216 | bool GCNPassConfig::addRegBankSelect() { |
| 1217 | addPass(new AMDGPURegBankSelect()); |
| 1218 | return false; |
| 1219 | } |
| 1220 | |
| 1221 | void GCNPassConfig::addPreGlobalInstructionSelect() { |
| 1222 | bool IsOptNone = getOptLevel() == CodeGenOpt::None; |
| 1223 | addPass(createAMDGPURegBankCombiner(IsOptNone)); |
| 1224 | } |
| 1225 | |
| 1226 | bool GCNPassConfig::addGlobalInstructionSelect() { |
| 1227 | addPass(new InstructionSelect(getOptLevel())); |
| 1228 | return false; |
| 1229 | } |
| 1230 | |
| 1231 | void GCNPassConfig::addPreRegAlloc() { |
| 1232 | if (LateCFGStructurize) { |
| 1233 | addPass(createAMDGPUMachineCFGStructurizerPass()); |
| 1234 | } |
| 1235 | } |
| 1236 | |
| 1237 | void GCNPassConfig::addFastRegAlloc() { |
| 1238 | // FIXME: We have to disable the verifier here because of PHIElimination + |
| 1239 | // TwoAddressInstructions disabling it. |
| 1240 | |
| 1241 | // This must be run immediately after phi elimination and before |
| 1242 | // TwoAddressInstructions, otherwise the processing of the tied operand of |
| 1243 | // SI_ELSE will introduce a copy of the tied operand source after the else. |
| 1244 | insertPass(&PHIEliminationID, &SILowerControlFlowID); |
| 1245 | |
| 1246 | insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID); |
| 1247 | insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID); |
| 1248 | |
| 1249 | TargetPassConfig::addFastRegAlloc(); |
| 1250 | } |
| 1251 | |
| 1252 | void GCNPassConfig::addOptimizedRegAlloc() { |
| 1253 | // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation |
| 1254 | // instructions that cause scheduling barriers. |
| 1255 | insertPass(&MachineSchedulerID, &SIWholeQuadModeID); |
| 1256 | insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID); |
| 1257 | |
| 1258 | if (OptExecMaskPreRA) |
| 1259 | insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); |
| 1260 | |
| 1261 | if (isPassEnabled(EnablePreRAOptimizations)) |
| 1262 | insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID); |
| 1263 | |
| 1264 | // This is not an essential optimization and it has a noticeable impact on |
| 1265 | // compilation time, so we only enable it from O2. |
| 1266 | if (TM->getOptLevel() > CodeGenOpt::Less) |
| 1267 | insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); |
| 1268 | |
| 1269 | // FIXME: when an instruction has a Killed operand, and the instruction is |
| 1270 | // inside a bundle, seems only the BUNDLE instruction appears as the Kills of |
| 1271 | // the register in LiveVariables, this would trigger a failure in verifier, |
| 1272 | // we should fix it and enable the verifier. |
| 1273 | if (OptVGPRLiveRange) |
| 1274 | insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID); |
| 1275 | // This must be run immediately after phi elimination and before |
| 1276 | // TwoAddressInstructions, otherwise the processing of the tied operand of |
| 1277 | // SI_ELSE will introduce a copy of the tied operand source after the else. |
| 1278 | insertPass(&PHIEliminationID, &SILowerControlFlowID); |
| 1279 | |
| 1280 | if (EnableDCEInRA) |
| 1281 | insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); |
| 1282 | |
| 1283 | TargetPassConfig::addOptimizedRegAlloc(); |
| 1284 | } |
| 1285 | |
| 1286 | bool GCNPassConfig::addPreRewrite() { |
| 1287 | if (EnableRegReassign) |
| 1288 | addPass(&GCNNSAReassignID); |
| 1289 | return true; |
| 1290 | } |
| 1291 | |
| 1292 | FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) { |
| 1293 | // Initialize the global default. |
| 1294 | llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag, |
| 1295 | initializeDefaultSGPRRegisterAllocatorOnce); |
| 1296 | |
| 1297 | RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); |
| 1298 | if (Ctor != useDefaultRegisterAllocator) |
| 1299 | return Ctor(); |
| 1300 | |
| 1301 | if (Optimized) |
| 1302 | return createGreedyRegisterAllocator(onlyAllocateSGPRs); |
| 1303 | |
| 1304 | return createFastRegisterAllocator(onlyAllocateSGPRs, false); |
| 1305 | } |
| 1306 | |
| 1307 | FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) { |
| 1308 | // Initialize the global default. |
| 1309 | llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag, |
| 1310 | initializeDefaultVGPRRegisterAllocatorOnce); |
| 1311 | |
| 1312 | RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); |
| 1313 | if (Ctor != useDefaultRegisterAllocator) |
| 1314 | return Ctor(); |
| 1315 | |
| 1316 | if (Optimized) |
| 1317 | return createGreedyVGPRRegisterAllocator(); |
| 1318 | |
| 1319 | return createFastVGPRRegisterAllocator(); |
| 1320 | } |
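| |
| | // Both factory functions defer to an allocator registered on the command
| | // line (via -sgpr-regalloc / -vgpr-regalloc); only when the default is
| | // requested do they fall back to greedy (optimized) or fast allocation
| | // restricted to the corresponding register class.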
| 1321 | |
| 1322 | FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) { |
| 1323 | llvm_unreachable("should not be used");
| 1324 | } |
| 1325 | |
| 1326 | static const char RegAllocOptNotSupportedMessage[] = |
| 1327 | "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc"; |
| 1328 | |
| 1329 | bool GCNPassConfig::addRegAssignAndRewriteFast() { |
| 1330 | if (!usingDefaultRegAlloc()) |
| 1331 | report_fatal_error(RegAllocOptNotSupportedMessage); |
| 1332 | |
| 1333 | addPass(createSGPRAllocPass(false)); |
| 1334 | |
| 1335 | // Equivalent of PEI for SGPRs. |
| 1336 | addPass(&SILowerSGPRSpillsID); |
| 1337 | |
| 1338 | addPass(createVGPRAllocPass(false)); |
| 1339 | return true; |
| 1340 | } |
| 1341 | |
| 1342 | bool GCNPassConfig::addRegAssignAndRewriteOptimized() { |
| 1343 | if (!usingDefaultRegAlloc()) |
| 1344 | report_fatal_error(RegAllocOptNotSupportedMessage); |
| 1345 | |
| 1346 | addPass(createSGPRAllocPass(true)); |
| 1347 | |
| 1348 | // Commit allocated register changes. This is mostly necessary because too |
| 1349 | // many things rely on the use lists of the physical registers, such as the |
| 1350 | // verifier. This is only necessary with allocators which use LiveIntervals, |
| 1351 | // since FastRegAlloc does the replacements itself. |
| 1352 | addPass(createVirtRegRewriter(false)); |
| 1353 | |
| 1354 | // Equivalent of PEI for SGPRs. |
| 1355 | addPass(&SILowerSGPRSpillsID); |
| 1356 | |
| 1357 | addPass(createVGPRAllocPass(true)); |
| 1358 | |
| 1359 | addPreRewrite(); |
| 1360 | addPass(&VirtRegRewriterID); |
| 1361 | |
| 1362 | return true; |
| 1363 | } |
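| |
| | // The resulting optimized order is: SGPR allocation, rewrite, SGPR spill
| | // lowering (SILowerSGPRSpills), VGPR allocation, optional NSA reassignment,
| | // final rewrite. Allocating SGPRs first allows their spills to be lowered
| | // into VGPR lanes before any VGPRs have been assigned.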
| 1364 | |
| 1365 | void GCNPassConfig::addPostRegAlloc() { |
| 1366 | addPass(&SIFixVGPRCopiesID); |
| 1367 | if (getOptLevel() > CodeGenOpt::None) |
| 1368 | addPass(&SIOptimizeExecMaskingID); |
| 1369 | TargetPassConfig::addPostRegAlloc(); |
| 1370 | } |
| 1371 | |
| 1372 | void GCNPassConfig::addPreSched2() { |
| 1373 | if (TM->getOptLevel() > CodeGenOpt::None) |
| 1374 | addPass(createSIShrinkInstructionsPass()); |
| 1375 | addPass(&SIPostRABundlerID); |
| 1376 | } |
| 1377 | |
| 1378 | void GCNPassConfig::addPreEmitPass() { |
| 1379 | if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) |
| 1380 | addPass(&GCNCreateVOPDID); |
| 1381 | addPass(createSIMemoryLegalizerPass()); |
| 1382 | addPass(createSIInsertWaitcntsPass()); |
| 1383 | |
| 1384 | addPass(createSIModeRegisterPass()); |
| 1385 | |
| 1386 | if (getOptLevel() > CodeGenOpt::None) |
| 1387 | addPass(&SIInsertHardClausesID); |
| 1388 | |
| 1389 | addPass(&SILateBranchLoweringPassID); |
| 1390 | if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less)) |
| 1391 | addPass(createAMDGPUSetWavePriorityPass()); |
| 1392 | if (getOptLevel() > CodeGenOpt::None) |
| 1393 | addPass(&SIPreEmitPeepholeID); |
| 1394 | // The hazard recognizer that runs as part of the post-ra scheduler does not |
| 1395 | // guarantee to be able to handle all hazards correctly. This is because if there
| 1396 | // are multiple scheduling regions in a basic block, the regions are scheduled |
| 1397 | // bottom up, so when we begin to schedule a region we don't know what |
| 1398 | // instructions were emitted directly before it. |
| 1399 | // |
| 1400 | // Here we add a stand-alone hazard recognizer pass which can handle all |
| 1401 | // cases. |
| 1402 | addPass(&PostRAHazardRecognizerID); |
| 1403 | |
| 1404 | if (getOptLevel() > CodeGenOpt::Less) |
| 1405 | addPass(&AMDGPUReleaseVGPRsID); |
| 1406 | |
| 1407 | if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less)) |
| 1408 | addPass(&AMDGPUInsertDelayAluID); |
| 1409 | |
| 1410 | addPass(&BranchRelaxationPassID); |
| 1411 | } |
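| |
| | // BranchRelaxation is intentionally last: the passes above may insert
| | // instructions (waitcnts, hard clauses, delay ALUs) and therefore change
| | // branch distances.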
| 1412 | |
| 1413 | TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { |
| 1414 | return new GCNPassConfig(*this, PM); |
| 1415 | } |
| 1416 | |
| 1417 | MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo( |
| 1418 | BumpPtrAllocator &Allocator, const Function &F, |
| 1419 | const TargetSubtargetInfo *STI) const { |
| 1420 | return SIMachineFunctionInfo::create<SIMachineFunctionInfo>( |
| 1421 | Allocator, F, static_cast<const GCNSubtarget *>(STI)); |
| 1422 | } |
| 1423 | |
| 1424 | yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const { |
| 1425 | return new yaml::SIMachineFunctionInfo(); |
| 1426 | } |
| 1427 | |
| 1428 | yaml::MachineFunctionInfo * |
| 1429 | GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { |
| 1430 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| 1431 | return new yaml::SIMachineFunctionInfo( |
| 1432 | *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF); |
| 1433 | } |
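| |
| | // The serialized info appears under the "machineFunctionInfo" key of a .mir
| | // file. A hand-written, abbreviated sketch of what such a block can look
| | // like (register choices are arbitrary):
| |
| | //   machineFunctionInfo:
| | //     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
| | //     frameOffsetReg:    '$sgpr33'
| | //     stackPtrOffsetReg: '$sgpr32'
| | //     argumentInfo:
| | //       privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
| | //       workItemIDX:          { reg: '$vgpr0' }
| |
| | // parseMachineFunctionInfo below performs the reverse mapping.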
| 1434 | |
| 1435 | bool GCNTargetMachine::parseMachineFunctionInfo( |
| 1436 | const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS, |
| 1437 | SMDiagnostic &Error, SMRange &SourceRange) const { |
| 1438 | const yaml::SIMachineFunctionInfo &YamlMFI = |
| 1439 | static_cast<const yaml::SIMachineFunctionInfo &>(MFI_); |
| 1440 | MachineFunction &MF = PFS.MF; |
| 1441 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| 1442 | |
| 1443 | if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange)) |
| 1444 | return true; |
| 1445 | |
| 1446 | if (MFI->Occupancy == 0) { |
| 1447 | // Fix up the subtarget-dependent default value.
| 1448 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| 1449 | MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize()); |
| 1450 | } |
| 1451 | |
| 1452 | auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) { |
| 1453 | Register TempReg; |
| 1454 | if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) { |
| 1455 | SourceRange = RegName.SourceRange; |
| 1456 | return true; |
| 1457 | } |
| 1458 | RegVal = TempReg; |
| 1459 | |
| 1460 | return false; |
| 1461 | }; |
| 1462 | |
| 1463 | auto parseOptionalRegister = [&](const yaml::StringValue &RegName, |
| 1464 | Register &RegVal) { |
| 1465 | return !RegName.Value.empty() && parseRegister(RegName, RegVal); |
| 1466 | }; |
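| |
| | // An empty string means the register was not serialized; that is not an
| | // error, so parseOptionalRegister simply leaves RegVal untouched.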
| 1467 | |
| 1468 | if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) |
| 1469 | return true; |
| 1470 | |
| 1471 | auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { |
| 1472 | // Create a diagnostic for the register string literal.
| 1473 | const MemoryBuffer &Buffer = |
| 1474 | *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); |
| 1475 | Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, |
| 1476 | RegName.Value.size(), SourceMgr::DK_Error, |
| 1477 | "incorrect register class for field", RegName.Value, |
| 1478 | std::nullopt, std::nullopt); |
| 1479 | SourceRange = RegName.SourceRange; |
| 1480 | return true; |
| 1481 | }; |
| 1482 | |
| 1483 | if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || |
| 1484 | parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || |
| 1485 | parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) |
| 1486 | return true; |
| 1487 | |
| 1488 | if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && |
| 1489 | !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) { |
| 1490 | return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); |
| 1491 | } |
| 1492 | |
| 1493 | if (MFI->FrameOffsetReg != AMDGPU::FP_REG && |
| 1494 | !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { |
| 1495 | return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); |
| 1496 | } |
| 1497 | |
| 1498 | if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG && |
| 1499 | !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) { |
| 1500 | return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg); |
| 1501 | } |
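| |
| | // The sentinel values PRIVATE_RSRC_REG, FP_REG and SP_REG denote "not yet
| | // assigned" and are deliberately exempt from the class checks above.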
| 1502 | |
| 1503 | for (const auto &YamlReg : YamlMFI.WWMReservedRegs) { |
| 1504 | Register ParsedReg; |
| 1505 | if (parseRegister(YamlReg, ParsedReg)) |
| 1506 | return true; |
| 1507 | |
| 1508 | MFI->reserveWWMRegister(ParsedReg); |
| 1509 | } |
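| |
| | // reserveWWMRegister records each parsed whole-wave-mode (WWM) register so
| | // the function's reserved set matches the state the MIR was serialized from.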
| 1510 | |
| 1511 | auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A, |
| 1512 | const TargetRegisterClass &RC, |
| 1513 | ArgDescriptor &Arg, unsigned UserSGPRs, |
| 1514 | unsigned SystemSGPRs) { |
| 1515 | // Skip parsing if it's not present. |
| 1516 | if (!A) |
| 1517 | return false; |
| 1518 | |
| 1519 | if (A->IsRegister) { |
| 1520 | Register Reg; |
| 1521 | if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) { |
| 1522 | SourceRange = A->RegisterName.SourceRange; |
| 1523 | return true; |
| 1524 | } |
| 1525 | if (!RC.contains(Reg)) |
| 1526 | return diagnoseRegisterClass(A->RegisterName); |
| 1527 | Arg = ArgDescriptor::createRegister(Reg); |
| 1528 | } else |
| 1529 | Arg = ArgDescriptor::createStack(A->StackOffset); |
| 1530 | // Check and apply the optional mask. |
| 1531 | if (A->Mask) |
| 1532 | Arg = ArgDescriptor::createArg(Arg, *A->Mask); |
| 1533 | |
| 1534 | MFI->NumUserSGPRs += UserSGPRs; |
| 1535 | MFI->NumSystemSGPRs += SystemSGPRs; |
| 1536 | return false; |
| 1537 | }; |
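| |
| | // The UserSGPRs/SystemSGPRs arguments below encode how many SGPRs each
| | // preloaded argument occupies, so NumUserSGPRs/NumSystemSGPRs end up
| | // matching the parsed argument set.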
| 1538 | |
| 1539 | if (YamlMFI.ArgInfo && |
| 1540 | (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, |
| 1541 | AMDGPU::SGPR_128RegClass, |
| 1542 | MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || |
| 1543 | parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, |
| 1544 | AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, |
| 1545 | 2, 0) || |
| 1546 | parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass, |
| 1547 | MFI->ArgInfo.QueuePtr, 2, 0) || |
| 1548 | parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr, |
| 1549 | AMDGPU::SReg_64RegClass, |
| 1550 | MFI->ArgInfo.KernargSegmentPtr, 2, 0) || |
| 1551 | parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID, |
| 1552 | AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID, |
| 1553 | 2, 0) || |
| 1554 | parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit, |
| 1555 | AMDGPU::SReg_64RegClass, |
| 1556 | MFI->ArgInfo.FlatScratchInit, 2, 0) || |
| 1557 | parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize, |
| 1558 | AMDGPU::SGPR_32RegClass, |
| 1559 | MFI->ArgInfo.PrivateSegmentSize, 0, 0) || |
| 1560 | parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId, |
| 1561 | AMDGPU::SGPR_32RegClass, |
| 1562 | MFI->ArgInfo.LDSKernelId, 0, 1) || |
| 1563 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, |
| 1564 | AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, |
| 1565 | 0, 1) || |
| 1566 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY, |
| 1567 | AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY, |
| 1568 | 0, 1) || |
| 1569 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ, |
| 1570 | AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ, |
| 1571 | 0, 1) || |
| 1572 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo, |
| 1573 | AMDGPU::SGPR_32RegClass, |
| 1574 | MFI->ArgInfo.WorkGroupInfo, 0, 1) || |
| 1575 | parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset, |
| 1576 | AMDGPU::SGPR_32RegClass, |
| 1577 | MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) || |
| 1578 | parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr, |
| 1579 | AMDGPU::SReg_64RegClass, |
| 1580 | MFI->ArgInfo.ImplicitArgPtr, 0, 0) || |
| 1581 | parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr, |
| 1582 | AMDGPU::SReg_64RegClass, |
| 1583 | MFI->ArgInfo.ImplicitBufferPtr, 2, 0) || |
| 1584 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX, |
| 1585 | AMDGPU::VGPR_32RegClass, |
| 1586 | MFI->ArgInfo.WorkItemIDX, 0, 0) || |
| 1587 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY, |
| 1588 | AMDGPU::VGPR_32RegClass, |
| 1589 | MFI->ArgInfo.WorkItemIDY, 0, 0) || |
| 1590 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ, |
| 1591 | AMDGPU::VGPR_32RegClass, |
| 1592 | MFI->ArgInfo.WorkItemIDZ, 0, 0))) |
| 1593 | return true; |
| 1594 | |
| 1595 | MFI->Mode.IEEE = YamlMFI.Mode.IEEE; |
| 1596 | MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; |
| 1597 | |
| 1598 | // FIXME: Move proper support for denormal-fp-math into base MachineFunction |
| 1599 | MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals |
| 1600 | ? DenormalMode::IEEE |
| 1601 | : DenormalMode::PreserveSign; |
| 1602 | MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals |
| 1603 | ? DenormalMode::IEEE |
| 1604 | : DenormalMode::PreserveSign; |
| 1605 | |
| 1606 | MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals |
| 1607 | ? DenormalMode::IEEE |
| 1608 | : DenormalMode::PreserveSign; |
| 1609 | MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals |
| 1610 | ? DenormalMode::IEEE |
| 1611 | : DenormalMode::PreserveSign; |
| 1612 | |
| 1613 | return false; |
| 1614 | } |