LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUTargetMachine.cpp (source / functions)
Test:      llvm-toolchain.info
Date:      2018-07-13 00:08:38
Coverage:  Lines: 407 of 415 hit (98.1 %)   Functions: 61 of 66 hit (92.4 %)
Legend:    Lines: hit / not hit

          Line data    Source code
       1             : //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
       11             : /// The AMDGPU target machine contains all of the hardware-specific
       12             : /// information needed to emit code for R600 and SI GPUs.
      13             : //
      14             : //===----------------------------------------------------------------------===//
      15             : 
      16             : #include "AMDGPUTargetMachine.h"
      17             : #include "AMDGPU.h"
      18             : #include "AMDGPUAliasAnalysis.h"
      19             : #include "AMDGPUCallLowering.h"
      20             : #include "AMDGPUInstructionSelector.h"
      21             : #include "AMDGPULegalizerInfo.h"
      22             : #include "AMDGPUMacroFusion.h"
      23             : #include "AMDGPUTargetObjectFile.h"
      24             : #include "AMDGPUTargetTransformInfo.h"
      25             : #include "GCNIterativeScheduler.h"
      26             : #include "GCNSchedStrategy.h"
      27             : #include "R600MachineScheduler.h"
      28             : #include "SIMachineScheduler.h"
      29             : #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
      30             : #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
      31             : #include "llvm/CodeGen/GlobalISel/Legalizer.h"
      32             : #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
      33             : #include "llvm/CodeGen/Passes.h"
      34             : #include "llvm/CodeGen/TargetPassConfig.h"
      35             : #include "llvm/IR/Attributes.h"
      36             : #include "llvm/IR/Function.h"
      37             : #include "llvm/IR/LegacyPassManager.h"
      38             : #include "llvm/Pass.h"
      39             : #include "llvm/Support/CommandLine.h"
      40             : #include "llvm/Support/Compiler.h"
      41             : #include "llvm/Support/TargetRegistry.h"
      42             : #include "llvm/Target/TargetLoweringObjectFile.h"
      43             : #include "llvm/Transforms/IPO.h"
      44             : #include "llvm/Transforms/IPO/AlwaysInliner.h"
      45             : #include "llvm/Transforms/IPO/PassManagerBuilder.h"
      46             : #include "llvm/Transforms/Scalar.h"
      47             : #include "llvm/Transforms/Scalar/GVN.h"
      48             : #include "llvm/Transforms/Vectorize.h"
      49             : #include <memory>
      50             : 
      51             : using namespace llvm;
      52             : 
      53       99743 : static cl::opt<bool> EnableR600StructurizeCFG(
      54             :   "r600-ir-structurize",
      55       99743 :   cl::desc("Use StructurizeCFG IR pass"),
      56      299229 :   cl::init(true));
      57             : 
      58       99743 : static cl::opt<bool> EnableSROA(
      59             :   "amdgpu-sroa",
      60       99743 :   cl::desc("Run SROA after promote alloca pass"),
      61             :   cl::ReallyHidden,
      62      299229 :   cl::init(true));
      63             : 
      64             : static cl::opt<bool>
      65       99743 : EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
      66       99743 :                         cl::desc("Run early if-conversion"),
      67      299229 :                         cl::init(false));
      68             : 
      69       99743 : static cl::opt<bool> EnableR600IfConvert(
      70             :   "r600-if-convert",
      71       99743 :   cl::desc("Use if conversion pass"),
      72             :   cl::ReallyHidden,
      73      299229 :   cl::init(true));
      74             : 
      75             : // Option to disable vectorizer for tests.
      76       99743 : static cl::opt<bool> EnableLoadStoreVectorizer(
      77             :   "amdgpu-load-store-vectorizer",
      78       99743 :   cl::desc("Enable load store vectorizer"),
      79      199486 :   cl::init(true),
      80      299229 :   cl::Hidden);
      81             : 
       82             : // Option to control global load scalarization
      83       99743 : static cl::opt<bool> ScalarizeGlobal(
      84             :   "amdgpu-scalarize-global-loads",
      85       99743 :   cl::desc("Enable global load scalarization"),
      86      199486 :   cl::init(true),
      87      299229 :   cl::Hidden);
      88             : 
      89             : // Option to run internalize pass.
      90       99743 : static cl::opt<bool> InternalizeSymbols(
      91             :   "amdgpu-internalize-symbols",
      92       99743 :   cl::desc("Enable elimination of non-kernel functions and unused globals"),
      93      199486 :   cl::init(false),
      94      299229 :   cl::Hidden);
      95             : 
       96             : // Option to inline all functions early.
      97       99743 : static cl::opt<bool> EarlyInlineAll(
      98             :   "amdgpu-early-inline-all",
      99       99743 :   cl::desc("Inline all functions early"),
     100      199486 :   cl::init(false),
     101      299229 :   cl::Hidden);
     102             : 
     103       99743 : static cl::opt<bool> EnableSDWAPeephole(
     104             :   "amdgpu-sdwa-peephole",
     105       99743 :   cl::desc("Enable SDWA peepholer"),
     106      299229 :   cl::init(true));
     107             : 
     108             : // Enable address space based alias analysis
     109       99743 : static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
     110       99743 :   cl::desc("Enable AMDGPU Alias Analysis"),
     111      299229 :   cl::init(true));
     112             : 
     113             : // Option to run late CFG structurizer
     114       99743 : static cl::opt<bool, true> LateCFGStructurize(
     115             :   "amdgpu-late-structurize",
     116       99743 :   cl::desc("Enable late CFG structurization"),
     117      199486 :   cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
     118      299229 :   cl::Hidden);
     119             : 
     120       99743 : static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
     121             :   "amdgpu-function-calls",
     122       99743 :   cl::desc("Enable AMDGPU function call support"),
     123      199486 :   cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
     124      199486 :   cl::init(false),
     125      299229 :   cl::Hidden);
     126             : 
      127             : // Enable library call simplifications
     128       99743 : static cl::opt<bool> EnableLibCallSimplify(
     129             :   "amdgpu-simplify-libcall",
     130       99743 :   cl::desc("Enable amdgpu library simplifications"),
     131      199486 :   cl::init(true),
     132      299229 :   cl::Hidden);
     133             : 
     134       99743 : static cl::opt<bool> EnableLowerKernelArguments(
     135             :   "amdgpu-ir-lower-kernel-arguments",
     136       99743 :   cl::desc("Lower kernel argument loads in IR pass"),
     137      199486 :   cl::init(true),
     138      299229 :   cl::Hidden);
     139             : 
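A brief usage sketch (hypothetical invocation, not part of the measured source):
each cl::opt above defines a command-line switch on tools such as llc, so any of
these defaults can be overridden per run, for example:

    llc -march=amdgcn -amdgpu-sroa=0 -amdgpu-sdwa-peephole=0 input.ll
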
     140       98031 : extern "C" void LLVMInitializeAMDGPUTarget() {
     141             :   // Register the target
     142       98031 :   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
     143       98031 :   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
     144             : 
     145       98031 :   PassRegistry *PR = PassRegistry::getPassRegistry();
     146       98031 :   initializeR600ClauseMergePassPass(*PR);
     147       98031 :   initializeR600ControlFlowFinalizerPass(*PR);
     148       98031 :   initializeR600PacketizerPass(*PR);
     149       98031 :   initializeR600ExpandSpecialInstrsPassPass(*PR);
     150       98031 :   initializeR600VectorRegMergerPass(*PR);
     151       98031 :   initializeGlobalISel(*PR);
     152       98031 :   initializeAMDGPUDAGToDAGISelPass(*PR);
     153       98031 :   initializeSILowerI1CopiesPass(*PR);
     154       98031 :   initializeSIFixSGPRCopiesPass(*PR);
     155       98031 :   initializeSIFixVGPRCopiesPass(*PR);
     156       98031 :   initializeSIFoldOperandsPass(*PR);
     157       98031 :   initializeSIPeepholeSDWAPass(*PR);
     158       98031 :   initializeSIShrinkInstructionsPass(*PR);
     159       98031 :   initializeSIOptimizeExecMaskingPreRAPass(*PR);
     160       98031 :   initializeSILoadStoreOptimizerPass(*PR);
     161       98031 :   initializeAMDGPUAlwaysInlinePass(*PR);
     162       98031 :   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
     163       98031 :   initializeAMDGPUAnnotateUniformValuesPass(*PR);
     164       98031 :   initializeAMDGPUArgumentUsageInfoPass(*PR);
     165       98031 :   initializeAMDGPULowerKernelArgumentsPass(*PR);
     166       98031 :   initializeAMDGPULowerKernelAttributesPass(*PR);
     167       98031 :   initializeAMDGPULowerIntrinsicsPass(*PR);
     168       98031 :   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
     169       98031 :   initializeAMDGPUPromoteAllocaPass(*PR);
     170       98031 :   initializeAMDGPUCodeGenPreparePass(*PR);
     171       98031 :   initializeAMDGPURewriteOutArgumentsPass(*PR);
     172       98031 :   initializeAMDGPUUnifyMetadataPass(*PR);
     173       98031 :   initializeSIAnnotateControlFlowPass(*PR);
     174       98031 :   initializeSIInsertWaitcntsPass(*PR);
     175       98031 :   initializeSIWholeQuadModePass(*PR);
     176       98031 :   initializeSILowerControlFlowPass(*PR);
     177       98031 :   initializeSIInsertSkipsPass(*PR);
     178       98031 :   initializeSIMemoryLegalizerPass(*PR);
     179       98031 :   initializeSIDebuggerInsertNopsPass(*PR);
     180       98031 :   initializeSIOptimizeExecMaskingPass(*PR);
     181       98031 :   initializeSIFixWWMLivenessPass(*PR);
     182       98031 :   initializeSIFormMemoryClausesPass(*PR);
     183       98031 :   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
     184       98031 :   initializeAMDGPUAAWrapperPassPass(*PR);
     185       98031 :   initializeAMDGPUUseNativeCallsPass(*PR);
     186       98031 :   initializeAMDGPUSimplifyLibCallsPass(*PR);
     187       98031 :   initializeAMDGPUInlinerPass(*PR);
     188       98031 : }
     189             : 
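A hedged side note: registering each pass with the PassRegistry, as done above,
is what lets generic machinery refer to it by name; the names come from the
corresponding INITIALIZE_PASS macros. Hypothetical invocation:

    llc -march=amdgcn -print-after=si-fold-operands input.ll
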
     190             : static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
     191        5158 :   return llvm::make_unique<AMDGPUTargetObjectFile>();
     192             : }
     193             : 
     194        2238 : static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
     195        6714 :   return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
     196             : }
     197             : 
     198           1 : static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
     199           1 :   return new SIScheduleDAGMI(C);
     200             : }
     201             : 
     202             : static ScheduleDAGInstrs *
     203       17674 : createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
     204             :   ScheduleDAGMILive *DAG =
     205       35348 :     new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
     206       35348 :   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     207       35348 :   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     208       35348 :   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
     209       17674 :   return DAG;
     210             : }
     211             : 
     212             : static ScheduleDAGInstrs *
     213           3 : createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
     214             :   auto DAG = new GCNIterativeScheduler(C,
     215           3 :     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
     216           6 :   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     217           6 :   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     218           3 :   return DAG;
     219             : }
     220             : 
     221           3 : static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
     222             :   return new GCNIterativeScheduler(C,
     223           3 :     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
     224             : }
     225             : 
     226             : static ScheduleDAGInstrs *
     227           2 : createIterativeILPMachineScheduler(MachineSchedContext *C) {
     228             :   auto DAG = new GCNIterativeScheduler(C,
     229           2 :     GCNIterativeScheduler::SCHEDULE_ILP);
     230           4 :   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     231           4 :   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     232           4 :   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
     233           2 :   return DAG;
     234             : }
     235             : 
     236             : static MachineSchedRegistry
     237       99743 : R600SchedRegistry("r600", "Run R600's custom scheduler",
     238             :                    createR600MachineScheduler);
     239             : 
     240             : static MachineSchedRegistry
     241       99743 : SISchedRegistry("si", "Run SI's custom scheduler",
     242             :                 createSIMachineScheduler);
     243             : 
     244             : static MachineSchedRegistry
     245       99743 : GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
     246             :                              "Run GCN scheduler to maximize occupancy",
     247             :                              createGCNMaxOccupancyMachineScheduler);
     248             : 
     249             : static MachineSchedRegistry
     250       99743 : IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
     251             :   "Run GCN scheduler to maximize occupancy (experimental)",
     252             :   createIterativeGCNMaxOccupancyMachineScheduler);
     253             : 
     254             : static MachineSchedRegistry
     255       99743 : GCNMinRegSchedRegistry("gcn-minreg",
     256             :   "Run GCN iterative scheduler for minimal register usage (experimental)",
     257             :   createMinRegScheduler);
     258             : 
     259             : static MachineSchedRegistry
     260       99743 : GCNILPSchedRegistry("gcn-ilp",
     261             :   "Run GCN iterative scheduler for ILP scheduling (experimental)",
     262             :   createIterativeILPMachineScheduler);
     263             : 
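A selection sketch (hypothetical command lines, not from this file): each
MachineSchedRegistry entry above becomes a value of llc's -misched option, so
the alternative schedulers can be chosen per run:

    llc -march=amdgcn -mcpu=gfx900 -misched=gcn-ilp input.ll
    llc -march=amdgcn -mcpu=gfx900 -misched=gcn-minreg input.ll
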
     264             : static StringRef computeDataLayout(const Triple &TT) {
     265        2579 :   if (TT.getArch() == Triple::r600) {
     266             :     // 32-bit pointers.
      267             :     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
      268             :            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
     269             :   }
     270             : 
     271             :   // 32-bit private, local, and region pointers. 64-bit global, constant and
     272             :   // flat.
      273             :   return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
     274             :          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
     275             :          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
     276             : }
     277             : 
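A minimal sketch of what these layout strings encode (hypothetical caller;
address-space numbering as implied by the string, 1 = global, 5 = private):

    DataLayout DL(computeDataLayout(Triple("amdgcn--amdhsa")));
    unsigned GlobalBits  = DL.getPointerSizeInBits(1); // 64, from "p1:64:64"
    unsigned PrivateBits = DL.getPointerSizeInBits(5); // 32, from "p5:32:32"
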
     278             : LLVM_READNONE
     279             : static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
     280        2579 :   if (!GPU.empty())
     281             :     return GPU;
     282             : 
     283         732 :   if (TT.getArch() == Triple::amdgcn)
     284             :     return "generic";
     285             : 
     286             :   return "r600";
     287             : }
     288             : 
     289             : static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
     290             :   // The AMDGPU toolchain only supports generating shared objects, so we
     291             :   // must always use PIC.
     292             :   return Reloc::PIC_;
     293             : }
     294             : 
     295             : static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
     296        2579 :   if (CM)
     297             :     return *CM;
     298             :   return CodeModel::Small;
     299             : }
     300             : 
     301        2579 : AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
     302             :                                          StringRef CPU, StringRef FS,
     303             :                                          TargetOptions Options,
     304             :                                          Optional<Reloc::Model> RM,
     305             :                                          Optional<CodeModel::Model> CM,
     306        2579 :                                          CodeGenOpt::Level OptLevel)
     307             :     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
     308             :                         FS, Options, getEffectiveRelocModel(RM),
     309             :                         getEffectiveCodeModel(CM), OptLevel),
     310        5158 :       TLOF(createTLOF(getTargetTriple())) {
     311        5158 :   AS = AMDGPU::getAMDGPUAS(TT);
     312        2579 :   initAsmInfo();
     313        2579 : }
     314             : 
     315             : bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
     316             : bool AMDGPUTargetMachine::EnableFunctionCalls = false;
     317             : 
     318             : AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
     319             : 
     320      618918 : StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
     321      618918 :   Attribute GPUAttr = F.getFnAttribute("target-cpu");
     322      618918 :   return GPUAttr.hasAttribute(Attribute::None) ?
     323     1112286 :     getTargetCPU() : GPUAttr.getValueAsString();
     324             : }
     325             : 
     326      618918 : StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
     327      618918 :   Attribute FSAttr = F.getFnAttribute("target-features");
     328             : 
     329      618918 :   return FSAttr.hasAttribute(Attribute::None) ?
     330             :     getTargetFeatureString() :
     331      843621 :     FSAttr.getValueAsString();
     332             : }
     333             : 
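A sketch of where those attributes come from (hypothetical IR-building code, not
part of this file): a frontend can pin a function to a specific subtarget, and
the getSubtargetImpl() implementations below then return a matching cached
subtarget for it:

    F->addFnAttr("target-cpu", "gfx900");
    F->addFnAttr("target-features", "+fp64");
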
     334         120 : static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
     335       13210 :   return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
     336       13090 :       if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
     337       13090 :         AAR.addAAResult(WrapperPass->getResult());
     338       13330 :       });
     339             : }
     340             : 
     341             : /// Predicate for Internalize pass.
     342          14 : static bool mustPreserveGV(const GlobalValue &GV) {
     343             :   if (const Function *F = dyn_cast<Function>(&GV))
     344          20 :     return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
     345             : 
     346           4 :   return !GV.use_empty();
     347             : }
     348             : 
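An illustration of the predicate (hypothetical functions, not from this file):
given a defined kernel K using CallingConv::AMDGPU_KERNEL and a defined helper
H using the default C calling convention:

    assert(mustPreserveGV(K));   // entry functions stay externally visible
    assert(!mustPreserveGV(H));  // non-entry definitions may be internalized
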
     349         109 : void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
     350         109 :   Builder.DivergentTarget = true;
     351             : 
     352         109 :   bool EnableOpt = getOptLevel() > CodeGenOpt::None;
     353             :   bool Internalize = InternalizeSymbols;
     354         110 :   bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
     355         109 :   bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
     356         109 :   bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
     357             : 
     358         109 :   if (EnableAMDGPUFunctionCalls) {
     359           2 :     delete Builder.Inliner;
     360           2 :     Builder.Inliner = createAMDGPUFunctionInliningPass();
     361             :   }
     362             : 
     363         109 :   if (Internalize) {
     364             :     // If we're generating code, we always have the whole program available. The
     365             :     // relocations expected for externally visible functions aren't supported,
     366             :     // so make sure every non-entry function is hidden.
     367           4 :     Builder.addExtension(
     368             :       PassManagerBuilder::EP_EnabledOnOptLevel0,
     369           1 :       [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
     370           3 :         PM.add(createInternalizePass(mustPreserveGV));
     371           1 :       });
     372             :   }
     373             : 
     374         327 :   Builder.addExtension(
     375             :     PassManagerBuilder::EP_ModuleOptimizerEarly,
     376             :     [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
     377         180 :                                          legacy::PassManagerBase &PM) {
     378          60 :       if (AMDGPUAA) {
     379          60 :         PM.add(createAMDGPUAAWrapperPass());
     380          60 :         PM.add(createAMDGPUExternalAAWrapperPass());
     381             :       }
     382          60 :       PM.add(createAMDGPUUnifyMetadataPass());
     383          60 :       if (Internalize) {
     384           3 :         PM.add(createInternalizePass(mustPreserveGV));
     385           1 :         PM.add(createGlobalDCEPass());
     386             :       }
     387          60 :       if (EarlyInline)
     388           1 :         PM.add(createAMDGPUAlwaysInlinePass(false));
     389          60 :   });
     390             : 
     391         109 :   const auto &Opt = Options;
     392         327 :   Builder.addExtension(
     393             :     PassManagerBuilder::EP_EarlyAsPossible,
     394             :     [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
     395         278 :                                       legacy::PassManagerBase &PM) {
     396         109 :       if (AMDGPUAA) {
     397          60 :         PM.add(createAMDGPUAAWrapperPass());
     398          60 :         PM.add(createAMDGPUExternalAAWrapperPass());
     399             :       }
     400         109 :       PM.add(llvm::createAMDGPUUseNativeCallsPass());
     401         109 :       if (LibCallSimplify)
     402         120 :         PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
     403         109 :   });
     404             : 
     405         218 :   Builder.addExtension(
     406             :     PassManagerBuilder::EP_CGSCCOptimizerLate,
     407          60 :     [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
     408             :       // Add infer address spaces pass to the opt pipeline after inlining
     409             :       // but before SROA to increase SROA opportunities.
     410          60 :       PM.add(createInferAddressSpacesPass());
     411             : 
     412             :       // This should run after inlining to have any chance of doing anything,
     413             :       // and before other cleanup optimizations.
     414          60 :       PM.add(createAMDGPULowerKernelAttributesPass());
     415          60 :   });
     416         109 : }
     417             : 
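A minimal sketch of how a frontend picks these hooks up (assumed driver code,
not part of this file):

    PassManagerBuilder Builder;
    Builder.OptLevel = 2;
    TM->adjustPassManager(Builder);          // installs the extensions above
    legacy::PassManager MPM;
    Builder.populateModulePassManager(MPM);  // extension points fire in here
    MPM.run(M);
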
     418             : //===----------------------------------------------------------------------===//
     419             : // R600 Target Machine (R600 -> Cayman)
     420             : //===----------------------------------------------------------------------===//
     421             : 
     422         290 : R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
     423             :                                      StringRef CPU, StringRef FS,
     424             :                                      TargetOptions Options,
     425             :                                      Optional<Reloc::Model> RM,
     426             :                                      Optional<CodeModel::Model> CM,
     427         290 :                                      CodeGenOpt::Level OL, bool JIT)
     428         870 :     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
     429             :   setRequiresStructuredCFG(true);
     430         290 : }
     431             : 
     432       55694 : const R600Subtarget *R600TargetMachine::getSubtargetImpl(
     433             :   const Function &F) const {
     434       55694 :   StringRef GPU = getGPUName(F);
     435       55694 :   StringRef FS = getFeatureString(F);
     436             : 
     437             :   SmallString<128> SubtargetKey(GPU);
     438             :   SubtargetKey.append(FS);
     439             : 
     440       55694 :   auto &I = SubtargetMap[SubtargetKey];
     441       55694 :   if (!I) {
     442             :     // This needs to be done before we create a new subtarget since any
     443             :     // creation will depend on the TM and the code generation flags on the
     444             :     // function that reside in TargetOptions.
     445         286 :     resetTargetOptions(F);
     446         286 :     I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
     447             :   }
     448             : 
     449       55694 :   return I.get();
     450             : }
     451             : 
     452             : TargetTransformInfo
     453       39515 : R600TargetMachine::getTargetTransformInfo(const Function &F) {
     454       79030 :   return TargetTransformInfo(R600TTIImpl(this, F));
     455             : }
     456             : 
     457             : //===----------------------------------------------------------------------===//
     458             : // GCN Target Machine (SI+)
     459             : //===----------------------------------------------------------------------===//
     460             : 
     461        2289 : GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
     462             :                                    StringRef CPU, StringRef FS,
     463             :                                    TargetOptions Options,
     464             :                                    Optional<Reloc::Model> RM,
     465             :                                    Optional<CodeModel::Model> CM,
     466        2289 :                                    CodeGenOpt::Level OL, bool JIT)
     467        9156 :     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
     468             : 
     469      563224 : const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
     470      563224 :   StringRef GPU = getGPUName(F);
     471      563224 :   StringRef FS = getFeatureString(F);
     472             : 
     473             :   SmallString<128> SubtargetKey(GPU);
     474             :   SubtargetKey.append(FS);
     475             : 
     476      563224 :   auto &I = SubtargetMap[SubtargetKey];
     477      563224 :   if (!I) {
     478             :     // This needs to be done before we create a new subtarget since any
     479             :     // creation will depend on the TM and the code generation flags on the
     480             :     // function that reside in TargetOptions.
     481        2271 :     resetTargetOptions(F);
     482        2271 :     I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
     483             :   }
     484             : 
     485             :   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
     486             : 
     487      563224 :   return I.get();
     488             : }
     489             : 
     490             : TargetTransformInfo
     491      383569 : GCNTargetMachine::getTargetTransformInfo(const Function &F) {
     492      767138 :   return TargetTransformInfo(GCNTTIImpl(this, F));
     493             : }
     494             : 
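A usage sketch (hypothetical caller): middle-end passes consult the TTI returned
here rather than the target machine itself; for example, divergence-aware passes
ask:

    TargetTransformInfo TTI = TM->getTargetTransformInfo(F);
    if (TTI.hasBranchDivergence()) {
      // run divergence analysis before restructuring control flow
    }
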
     495             : //===----------------------------------------------------------------------===//
     496             : // AMDGPU Pass Setup
     497             : //===----------------------------------------------------------------------===//
     498             : 
     499             : namespace {
     500             : 
     501        2451 : class AMDGPUPassConfig : public TargetPassConfig {
     502             : public:
     503        2462 :   AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     504        2462 :     : TargetPassConfig(TM, PM) {
     505             :     // Exceptions and StackMaps are not supported, so these passes will never do
     506             :     // anything.
     507        2462 :     disablePass(&StackMapLivenessID);
     508        2462 :     disablePass(&FuncletLayoutID);
     509        2462 :   }
     510             : 
     511             :   AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
     512        4139 :     return getTM<AMDGPUTargetMachine>();
     513             :   }
     514             : 
     515             :   ScheduleDAGInstrs *
     516           0 :   createMachineScheduler(MachineSchedContext *C) const override {
     517           0 :     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
     518           0 :     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     519           0 :     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     520           0 :     return DAG;
     521             :   }
     522             : 
     523             :   void addEarlyCSEOrGVNPass();
     524             :   void addStraightLineScalarOptimizationPasses();
     525             :   void addIRPasses() override;
     526             :   void addCodeGenPrepare() override;
     527             :   bool addPreISel() override;
     528             :   bool addInstSelector() override;
     529             :   bool addGCPasses() override;
     530             : };
     531             : 
     532         562 : class R600PassConfig final : public AMDGPUPassConfig {
     533             : public:
     534             :   R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     535         282 :     : AMDGPUPassConfig(TM, PM) {}
     536             : 
     537        2238 :   ScheduleDAGInstrs *createMachineScheduler(
     538             :     MachineSchedContext *C) const override {
     539        2238 :     return createR600MachineScheduler(C);
     540             :   }
     541             : 
     542             :   bool addPreISel() override;
     543             :   bool addInstSelector() override;
     544             :   void addPreRegAlloc() override;
     545             :   void addPreSched2() override;
     546             :   void addPreEmitPass() override;
     547             : };
     548             : 
     549        4340 : class GCNPassConfig final : public AMDGPUPassConfig {
     550             : public:
     551        2180 :   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     552        2180 :     : AMDGPUPassConfig(TM, PM) {
      553             :     // It is necessary to know the register usage of the entire call graph. We
     554             :     // allow calls without EnableAMDGPUFunctionCalls if they are marked
     555             :     // noinline, so this is always required.
     556        2180 :     setRequiresCodeGenSCCOrder(true);
     557        2180 :   }
     558             : 
     559             :   GCNTargetMachine &getGCNTargetMachine() const {
     560             :     return getTM<GCNTargetMachine>();
     561             :   }
     562             : 
     563             :   ScheduleDAGInstrs *
     564             :   createMachineScheduler(MachineSchedContext *C) const override;
     565             : 
     566             :   bool addPreISel() override;
     567             :   void addMachineSSAOptimization() override;
     568             :   bool addILPOpts() override;
     569             :   bool addInstSelector() override;
     570             :   bool addIRTranslator() override;
     571             :   bool addLegalizeMachineIR() override;
     572             :   bool addRegBankSelect() override;
     573             :   bool addGlobalInstructionSelect() override;
     574             :   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
     575             :   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
     576             :   void addPreRegAlloc() override;
     577             :   void addPostRegAlloc() override;
     578             :   void addPreSched2() override;
     579             :   void addPreEmitPass() override;
     580             : };
     581             : 
     582             : } // end anonymous namespace
     583             : 
     584        4056 : void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
     585        4056 :   if (getOptLevel() == CodeGenOpt::Aggressive)
     586           0 :     addPass(createGVNPass());
     587             :   else
     588        4056 :     addPass(createEarlyCSEPass());
     589        4056 : }
     590             : 
     591        2028 : void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
     592        2028 :   addPass(createLICMPass());
     593        2028 :   addPass(createSeparateConstOffsetFromGEPPass());
     594        2028 :   addPass(createSpeculativeExecutionPass());
      595             :   // ReassociateGEPs exposes more opportunities for SLSR. See
     596             :   // the example in reassociate-geps-and-slsr.ll.
     597        2028 :   addPass(createStraightLineStrengthReducePass());
      598             :   // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
     599             :   // EarlyCSE can reuse.
     600        2028 :   addEarlyCSEOrGVNPass();
     601             :   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
     602        2028 :   addPass(createNaryReassociatePass());
     603             :   // NaryReassociate on GEPs creates redundant common expressions, so run
     604             :   // EarlyCSE after it.
     605        2028 :   addPass(createEarlyCSEPass());
     606        2028 : }
     607             : 
     608        2073 : void AMDGPUPassConfig::addIRPasses() {
     609             :   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
     610             : 
     611             :   // There is no reason to run these.
     612        2073 :   disablePass(&StackMapLivenessID);
     613        2073 :   disablePass(&FuncletLayoutID);
     614        2073 :   disablePass(&PatchableFunctionID);
     615             : 
     616        2073 :   addPass(createAMDGPULowerIntrinsicsPass());
     617             : 
     618        3868 :   if (TM.getTargetTriple().getArch() == Triple::r600 ||
     619             :       !EnableAMDGPUFunctionCalls) {
     620             :     // Function calls are not supported, so make sure we inline everything.
     621        2073 :     addPass(createAMDGPUAlwaysInlinePass());
     622        2073 :     addPass(createAlwaysInlinerLegacyPass());
     623             :     // We need to add the barrier noop pass, otherwise adding the function
     624             :     // inlining pass will cause all of the PassConfigs passes to be run
      625             :     // inlining pass will cause all of the PassConfig's passes to be run
      626             :     // one function at a time, which means if we have a module with two
     627             :     // without ever running any passes on the second.
     628        2073 :     addPass(createBarrierNoopPass());
     629             :   }
     630             : 
     631        2073 :   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
     632             :     // TODO: May want to move later or split into an early and late one.
     633             : 
     634        1795 :     addPass(createAMDGPUCodeGenPreparePass());
     635             :   }
     636             : 
     637             :   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
     638        2073 :   if (TM.getTargetTriple().getArch() == Triple::r600)
     639         278 :     addPass(createR600OpenCLImageTypeLoweringPass());
     640             : 
     641             :   // Replace OpenCL enqueued block function pointers with global variables.
     642        2073 :   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
     643             : 
     644        2073 :   if (TM.getOptLevel() > CodeGenOpt::None) {
     645        2028 :     addPass(createInferAddressSpacesPass());
     646        2028 :     addPass(createAMDGPUPromoteAlloca());
     647             : 
     648        2028 :     if (EnableSROA)
     649        2002 :       addPass(createSROAPass());
     650             : 
     651        2028 :     addStraightLineScalarOptimizationPasses();
     652             : 
     653        2028 :     if (EnableAMDGPUAliasAnalysis) {
     654        2015 :       addPass(createAMDGPUAAWrapperPass());
     655        4030 :       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
     656      112399 :                                              AAResults &AAR) {
     657      112399 :         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
     658      112399 :           AAR.addAAResult(WrapperPass->getResult());
     659      112399 :         }));
     660             :     }
     661             :   }
     662             : 
     663        2073 :   TargetPassConfig::addIRPasses();
     664             : 
     665             :   // EarlyCSE is not always strong enough to clean up what LSR produces. For
     666             :   // example, GVN can combine
     667             :   //
     668             :   //   %0 = add %a, %b
     669             :   //   %1 = add %b, %a
     670             :   //
     671             :   // and
     672             :   //
     673             :   //   %0 = shl nsw %a, 2
     674             :   //   %1 = shl %a, 2
     675             :   //
     676             :   // but EarlyCSE can do neither of them.
     677        2073 :   if (getOptLevel() != CodeGenOpt::None)
     678        2028 :     addEarlyCSEOrGVNPass();
     679        2073 : }
     680             : 
     681        2073 : void AMDGPUPassConfig::addCodeGenPrepare() {
     682        3868 :   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
     683             :       EnableLowerKernelArguments)
     684        1794 :     addPass(createAMDGPULowerKernelArgumentsPass());
     685             : 
     686        2073 :   TargetPassConfig::addCodeGenPrepare();
     687             : 
     688        2073 :   if (EnableLoadStoreVectorizer)
     689        2062 :     addPass(createLoadStoreVectorizerPass());
     690        2073 : }
     691             : 
     692        2073 : bool AMDGPUPassConfig::addPreISel() {
     693        2073 :   addPass(createFlattenCFGPass());
     694        2073 :   return false;
     695             : }
     696             : 
     697        1788 : bool AMDGPUPassConfig::addInstSelector() {
     698        3576 :   addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
     699        1788 :   return false;
     700             : }
     701             : 
     702        2073 : bool AMDGPUPassConfig::addGCPasses() {
     703             :   // Do nothing. GC is not supported.
     704        2073 :   return false;
     705             : }
     706             : 
     707             : //===----------------------------------------------------------------------===//
     708             : // R600 Pass Setup
     709             : //===----------------------------------------------------------------------===//
     710             : 
     711         278 : bool R600PassConfig::addPreISel() {
     712         278 :   AMDGPUPassConfig::addPreISel();
     713             : 
     714         278 :   if (EnableR600StructurizeCFG)
     715         276 :     addPass(createStructurizeCFGPass());
     716         278 :   return false;
     717             : }
     718             : 
     719         278 : bool R600PassConfig::addInstSelector() {
     720         556 :   addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
     721         278 :   return false;
     722             : }
     723             : 
     724         278 : void R600PassConfig::addPreRegAlloc() {
     725         278 :   addPass(createR600VectorRegMerger());
     726         278 : }
     727             : 
     728         278 : void R600PassConfig::addPreSched2() {
     729         278 :   addPass(createR600EmitClauseMarkers(), false);
     730         278 :   if (EnableR600IfConvert)
     731         277 :     addPass(&IfConverterID, false);
     732         278 :   addPass(createR600ClauseMergePass(), false);
     733         278 : }
     734             : 
     735         278 : void R600PassConfig::addPreEmitPass() {
     736         278 :   addPass(createAMDGPUCFGStructurizerPass(), false);
     737         278 :   addPass(createR600ExpandSpecialInstrsPass(), false);
     738         278 :   addPass(&FinalizeMachineBundlesID, false);
     739         278 :   addPass(createR600Packetizer(), false);
     740         278 :   addPass(createR600ControlFlowFinalizer(), false);
     741         278 : }
     742             : 
     743         282 : TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
     744         564 :   return new R600PassConfig(*this, PM);
     745             : }
     746             : 
     747             : //===----------------------------------------------------------------------===//
     748             : // GCN Pass Setup
     749             : //===----------------------------------------------------------------------===//
     750             : 
     751       17674 : ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
     752             :   MachineSchedContext *C) const {
     753       17674 :   const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
     754       17674 :   if (ST.enableSIScheduler())
     755           0 :     return createSIMachineScheduler(C);
     756       17674 :   return createGCNMaxOccupancyMachineScheduler(C);
     757             : }
     758             : 
     759        1795 : bool GCNPassConfig::addPreISel() {
     760        1795 :   AMDGPUPassConfig::addPreISel();
     761             : 
     762             :   // FIXME: We need to run a pass to propagate the attributes when calls are
     763             :   // supported.
     764        1795 :   addPass(createAMDGPUAnnotateKernelFeaturesPass());
     765             : 
     766             :   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
     767             :   // regions formed by them.
     768        1795 :   addPass(&AMDGPUUnifyDivergentExitNodesID);
     769        1795 :   if (!LateCFGStructurize) {
     770        1795 :     addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
     771             :   }
     772        1795 :   addPass(createSinkingPass());
     773        1795 :   addPass(createAMDGPUAnnotateUniformValues());
     774        1795 :   if (!LateCFGStructurize) {
     775        1795 :     addPass(createSIAnnotateControlFlowPass());
     776             :   }
     777             : 
     778        1795 :   return false;
     779             : }
     780             : 
     781        1751 : void GCNPassConfig::addMachineSSAOptimization() {
     782        1751 :   TargetPassConfig::addMachineSSAOptimization();
     783             : 
     784             :   // We want to fold operands after PeepholeOptimizer has run (or as part of
     785             :   // it), because it will eliminate extra copies making it easier to fold the
     786             :   // real source operand. We want to eliminate dead instructions after, so that
     787             :   // we see fewer uses of the copies. We then need to clean up the dead
     788             :   // instructions leftover after the operands are folded as well.
     789             :   //
     790             :   // XXX - Can we get away without running DeadMachineInstructionElim again?
     791        1751 :   addPass(&SIFoldOperandsID);
     792        1751 :   addPass(&DeadMachineInstructionElimID);
     793        1751 :   addPass(&SILoadStoreOptimizerID);
     794        1751 :   if (EnableSDWAPeephole) {
     795        1747 :     addPass(&SIPeepholeSDWAID);
     796        1747 :     addPass(&EarlyMachineLICMID);
     797        1747 :     addPass(&MachineCSEID);
     798        1747 :     addPass(&SIFoldOperandsID);
     799        1747 :     addPass(&DeadMachineInstructionElimID);
     800             :   }
     801        1751 :   addPass(createSIShrinkInstructionsPass());
     802        1751 : }
     803             : 
     804        1751 : bool GCNPassConfig::addILPOpts() {
     805        1751 :   if (EnableEarlyIfConversion)
     806           2 :     addPass(&EarlyIfConverterID);
     807             : 
     808             :   TargetPassConfig::addILPOpts();
     809        1751 :   return false;
     810             : }
     811             : 
     812        1788 : bool GCNPassConfig::addInstSelector() {
     813        1788 :   AMDGPUPassConfig::addInstSelector();
     814        1788 :   addPass(createSILowerI1CopiesPass());
     815        1788 :   addPass(&SIFixSGPRCopiesID);
     816        1788 :   return false;
     817             : }
     818             : 
     819           8 : bool GCNPassConfig::addIRTranslator() {
     820           8 :   addPass(new IRTranslator());
     821           8 :   return false;
     822             : }
     823             : 
     824           8 : bool GCNPassConfig::addLegalizeMachineIR() {
     825           8 :   addPass(new Legalizer());
     826           8 :   return false;
     827             : }
     828             : 
     829           8 : bool GCNPassConfig::addRegBankSelect() {
     830           8 :   addPass(new RegBankSelect());
     831           8 :   return false;
     832             : }
     833             : 
     834           8 : bool GCNPassConfig::addGlobalInstructionSelect() {
     835           8 :   addPass(new InstructionSelect());
     836           8 :   return false;
     837             : }
     838             : 
     839        1795 : void GCNPassConfig::addPreRegAlloc() {
     840        1795 :   if (LateCFGStructurize) {
     841           0 :     addPass(createAMDGPUMachineCFGStructurizerPass());
     842             :   }
     843        1795 :   addPass(createSIWholeQuadModePass());
     844        1795 : }
     845             : 
     846          44 : void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
     847             :   // FIXME: We have to disable the verifier here because of PHIElimination +
     848             :   // TwoAddressInstructions disabling it.
     849             : 
     850             :   // This must be run immediately after phi elimination and before
     851             :   // TwoAddressInstructions, otherwise the processing of the tied operand of
     852             :   // SI_ELSE will introduce a copy of the tied operand source after the else.
     853          88 :   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
     854             : 
     855             :   // This must be run after SILowerControlFlow, since it needs to use the
     856             :   // machine-level CFG, but before register allocation.
     857          88 :   insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
     858             : 
     859          44 :   TargetPassConfig::addFastRegAlloc(RegAllocPass);
     860          44 : }
     861             : 
     862        1751 : void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
     863        3502 :   insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
     864             : 
     865        3502 :   insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
     866             : 
     867             :   // This must be run immediately after phi elimination and before
     868             :   // TwoAddressInstructions, otherwise the processing of the tied operand of
     869             :   // SI_ELSE will introduce a copy of the tied operand source after the else.
     870        3502 :   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
     871             : 
     872             :   // This must be run after SILowerControlFlow, since it needs to use the
     873             :   // machine-level CFG, but before register allocation.
     874        3502 :   insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
     875             : 
     876        1751 :   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
     877        1751 : }
     878             : 
     879        1795 : void GCNPassConfig::addPostRegAlloc() {
     880        1795 :   addPass(&SIFixVGPRCopiesID);
     881        1795 :   addPass(&SIOptimizeExecMaskingID);
     882             :   TargetPassConfig::addPostRegAlloc();
     883        1795 : }
     884             : 
     885        1795 : void GCNPassConfig::addPreSched2() {
     886        1795 : }
     887             : 
     888        1795 : void GCNPassConfig::addPreEmitPass() {
     889             :   // The hazard recognizer that runs as part of the post-ra scheduler does not
      890             :   // guarantee to be able to handle all hazards correctly. This is because if there
     891             :   // are multiple scheduling regions in a basic block, the regions are scheduled
     892             :   // bottom up, so when we begin to schedule a region we don't know what
     893             :   // instructions were emitted directly before it.
     894             :   //
     895             :   // Here we add a stand-alone hazard recognizer pass which can handle all
     896             :   // cases.
     897        1795 :   addPass(&PostRAHazardRecognizerID);
     898             : 
     899        1795 :   addPass(createSIMemoryLegalizerPass());
     900        1795 :   addPass(createSIInsertWaitcntsPass());
     901        1795 :   addPass(createSIShrinkInstructionsPass());
     902        1795 :   addPass(&SIInsertSkipsPassID);
     903        1795 :   addPass(createSIDebuggerInsertNopsPass());
     904        1795 :   addPass(&BranchRelaxationPassID);
     905        1795 : }
     906             : 
     907        2180 : TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
     908        2180 :   return new GCNPassConfig(*this, PM);
     909      299229 : }
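
An end-to-end sketch of how this file's entry points are reached (hypothetical
standalone tool; error handling elided, and the matching TargetInfo/MC
initializers are assumed to be called as well):

    LLVMInitializeAMDGPUTarget();
    std::string Err;
    const Target *T = TargetRegistry::lookupTarget("amdgcn--amdhsa", Err);
    std::unique_ptr<TargetMachine> TM(T->createTargetMachine(
        "amdgcn--amdhsa", "gfx900", "", TargetOptions(), None));
    // addPassesToEmitFile() then invokes createPassConfig() defined above.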

Generated by: LCOV version 1.13