LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUTargetMachine.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2017-09-14 15:23:50
Coverage: Lines: 403 of 413 hit (97.6 %) | Functions: 58 of 63 hit (92.1 %)

          Line data    Source code
       1             : //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
       11             : /// \brief The AMDGPU target machine contains all of the hardware-specific
       12             : /// information needed to emit code for R600 and SI GPUs.
      13             : //
      14             : //===----------------------------------------------------------------------===//
      15             : 
      16             : #include "AMDGPUTargetMachine.h"
      17             : #include "AMDGPU.h"
      18             : #include "AMDGPUAliasAnalysis.h"
      19             : #include "AMDGPUCallLowering.h"
      20             : #include "AMDGPUInstructionSelector.h"
      21             : #include "AMDGPULegalizerInfo.h"
      22             : #include "AMDGPUMacroFusion.h"
      23             : #include "AMDGPUTargetObjectFile.h"
      24             : #include "AMDGPUTargetTransformInfo.h"
      25             : #include "GCNIterativeScheduler.h"
      26             : #include "GCNSchedStrategy.h"
      27             : #include "R600MachineScheduler.h"
      28             : #include "SIMachineScheduler.h"
      29             : #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
      30             : #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
      31             : #include "llvm/CodeGen/GlobalISel/Legalizer.h"
      32             : #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
      33             : #include "llvm/CodeGen/Passes.h"
      34             : #include "llvm/CodeGen/TargetPassConfig.h"
      35             : #include "llvm/IR/Attributes.h"
      36             : #include "llvm/IR/Function.h"
      37             : #include "llvm/IR/LegacyPassManager.h"
      38             : #include "llvm/Pass.h"
      39             : #include "llvm/Support/CommandLine.h"
      40             : #include "llvm/Support/Compiler.h"
      41             : #include "llvm/Support/TargetRegistry.h"
      42             : #include "llvm/Target/TargetLoweringObjectFile.h"
      43             : #include "llvm/Transforms/IPO.h"
      44             : #include "llvm/Transforms/IPO/AlwaysInliner.h"
      45             : #include "llvm/Transforms/IPO/PassManagerBuilder.h"
      46             : #include "llvm/Transforms/Scalar.h"
      47             : #include "llvm/Transforms/Scalar/GVN.h"
      48             : #include "llvm/Transforms/Vectorize.h"
      49             : #include <memory>
      50             : 
      51             : using namespace llvm;
      52             : 
      53       72306 : static cl::opt<bool> EnableR600StructurizeCFG(
      54             :   "r600-ir-structurize",
      55      216918 :   cl::desc("Use StructurizeCFG IR pass"),
      56      289224 :   cl::init(true));
      57             : 
      58       72306 : static cl::opt<bool> EnableSROA(
      59             :   "amdgpu-sroa",
      60      216918 :   cl::desc("Run SROA after promote alloca pass"),
      61             :   cl::ReallyHidden,
      62      289224 :   cl::init(true));
      63             : 
      64             : static cl::opt<bool>
      65       72306 : EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
      66      216918 :                         cl::desc("Run early if-conversion"),
      67      289224 :                         cl::init(false));
      68             : 
      69       72306 : static cl::opt<bool> EnableR600IfConvert(
      70             :   "r600-if-convert",
      71      216918 :   cl::desc("Use if conversion pass"),
      72             :   cl::ReallyHidden,
      73      289224 :   cl::init(true));
      74             : 
       75             : // Option to disable the load/store vectorizer for tests.
      76       72306 : static cl::opt<bool> EnableLoadStoreVectorizer(
      77             :   "amdgpu-load-store-vectorizer",
      78      216918 :   cl::desc("Enable load store vectorizer"),
      79      216918 :   cl::init(true),
      80      216918 :   cl::Hidden);
      81             : 
       82             : // Option to control global load scalarization
      83       72306 : static cl::opt<bool> ScalarizeGlobal(
      84             :   "amdgpu-scalarize-global-loads",
      85      216918 :   cl::desc("Enable global load scalarization"),
      86      216918 :   cl::init(true),
      87      216918 :   cl::Hidden);
      88             : 
      89             : // Option to run internalize pass.
      90       72306 : static cl::opt<bool> InternalizeSymbols(
      91             :   "amdgpu-internalize-symbols",
      92      216918 :   cl::desc("Enable elimination of non-kernel functions and unused globals"),
      93      216918 :   cl::init(false),
      94      216918 :   cl::Hidden);
      95             : 
      96             : // Option to inline all early.
      97       72306 : static cl::opt<bool> EarlyInlineAll(
      98             :   "amdgpu-early-inline-all",
      99      216918 :   cl::desc("Inline all functions early"),
     100      216918 :   cl::init(false),
     101      216918 :   cl::Hidden);
     102             : 
     103       72306 : static cl::opt<bool> EnableSDWAPeephole(
     104             :   "amdgpu-sdwa-peephole",
     105      216918 :   cl::desc("Enable SDWA peepholer"),
     106      289224 :   cl::init(true));
     107             : 
      108             : // Enable address-space-based alias analysis
     109       72306 : static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
     110      216918 :   cl::desc("Enable AMDGPU Alias Analysis"),
     111      289224 :   cl::init(true));
     112             : 
     113             : // Option to enable new waitcnt insertion pass.
     114       72306 : static cl::opt<bool> EnableSIInsertWaitcntsPass(
     115             :   "enable-si-insert-waitcnts",
     116      216918 :   cl::desc("Use new waitcnt insertion pass"),
     117      289224 :   cl::init(true));
     118             : 
     119             : // Option to run late CFG structurizer
     120       72306 : static cl::opt<bool> LateCFGStructurize(
     121             :   "amdgpu-late-structurize",
     122      216918 :   cl::desc("Enable late CFG structurization"),
     123      216918 :   cl::init(false),
     124      216918 :   cl::Hidden);
     125             : 
     126       72306 : static cl::opt<bool> EnableAMDGPUFunctionCalls(
     127             :   "amdgpu-function-calls",
     128             :   cl::Hidden,
     129      216918 :   cl::desc("Enable AMDGPU function call support"),
     130      289224 :   cl::init(false));
     131             : 
      132             : // Enable library call simplifications
     133       72306 : static cl::opt<bool> EnableLibCallSimplify(
     134             :   "amdgpu-simplify-libcall",
      135      216918 :   cl::desc("Enable amdgpu library simplifications"),
     136      216918 :   cl::init(true),
     137      216918 :   cl::Hidden);
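
Each flag above is an ordinary llvm::cl option, so whenever this target is
compiled in, the knobs can be flipped directly from the tool command line. A
sketch of one such invocation (hypothetical input file and GPU choice):

    llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole=0 -amdgpu-sroa=0 kernel.ll

Options declared with cl::init(true) default to on, so tests usually pass the
=0 form to disable a pass rather than the =1 form to enable one.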
     138             : 
     139       68818 : extern "C" void LLVMInitializeAMDGPUTarget() {
     140             :   // Register the target
     141      137636 :   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
     142      137636 :   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
     143             : 
     144       68818 :   PassRegistry *PR = PassRegistry::getPassRegistry();
     145       68818 :   initializeR600ClauseMergePassPass(*PR);
     146       68818 :   initializeR600ControlFlowFinalizerPass(*PR);
     147       68818 :   initializeR600PacketizerPass(*PR);
     148       68818 :   initializeR600ExpandSpecialInstrsPassPass(*PR);
     149       68818 :   initializeR600VectorRegMergerPass(*PR);
     150       68818 :   initializeAMDGPUDAGToDAGISelPass(*PR);
     151       68818 :   initializeSILowerI1CopiesPass(*PR);
     152       68818 :   initializeSIFixSGPRCopiesPass(*PR);
     153       68818 :   initializeSIFixVGPRCopiesPass(*PR);
     154       68818 :   initializeSIFoldOperandsPass(*PR);
     155       68818 :   initializeSIPeepholeSDWAPass(*PR);
     156       68818 :   initializeSIShrinkInstructionsPass(*PR);
     157       68818 :   initializeSIOptimizeExecMaskingPreRAPass(*PR);
     158       68818 :   initializeSILoadStoreOptimizerPass(*PR);
     159       68818 :   initializeAMDGPUAlwaysInlinePass(*PR);
     160       68818 :   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
     161       68818 :   initializeAMDGPUAnnotateUniformValuesPass(*PR);
     162       68818 :   initializeAMDGPUArgumentUsageInfoPass(*PR);
     163       68818 :   initializeAMDGPULowerIntrinsicsPass(*PR);
     164       68818 :   initializeAMDGPUPromoteAllocaPass(*PR);
     165       68818 :   initializeAMDGPUCodeGenPreparePass(*PR);
     166       68818 :   initializeAMDGPURewriteOutArgumentsPass(*PR);
     167       68818 :   initializeAMDGPUUnifyMetadataPass(*PR);
     168       68818 :   initializeSIAnnotateControlFlowPass(*PR);
     169       68818 :   initializeSIInsertWaitsPass(*PR);
     170       68818 :   initializeSIInsertWaitcntsPass(*PR);
     171       68818 :   initializeSIWholeQuadModePass(*PR);
     172       68818 :   initializeSILowerControlFlowPass(*PR);
     173       68818 :   initializeSIInsertSkipsPass(*PR);
     174       68818 :   initializeSIMemoryLegalizerPass(*PR);
     175       68818 :   initializeSIDebuggerInsertNopsPass(*PR);
     176       68818 :   initializeSIOptimizeExecMaskingPass(*PR);
     177       68818 :   initializeSIFixWWMLivenessPass(*PR);
     178       68818 :   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
     179       68818 :   initializeAMDGPUAAWrapperPassPass(*PR);
     180       68818 :   initializeAMDGPUUseNativeCallsPass(*PR);
     181       68818 :   initializeAMDGPUSimplifyLibCallsPass(*PR);
     182       68818 : }
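
Each initialize*Pass() call above runs registration code emitted by the
INITIALIZE_PASS family of macros in the corresponding pass's .cpp file; doing
it all eagerly here makes every pass known to the legacy PassRegistry (and
thus to flags such as -print-after) as soon as the target is initialized. A
sketch of the pattern being relied on (the macro arguments shown are
illustrative):

    // In the pass implementation file:
    INITIALIZE_PASS(SIFoldOperands, "si-fold-operands",
                    "SI Fold Operands", false, false)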
     183             : 
     184             : static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
     185        6177 :   return llvm::make_unique<AMDGPUTargetObjectFile>();
     186             : }
     187             : 
     188        2057 : static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
     189        6171 :   return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
     190             : }
     191             : 
     192           1 : static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
     193           1 :   return new SIScheduleDAGMI(C);
     194             : }
     195             : 
     196             : static ScheduleDAGInstrs *
     197       14647 : createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
     198             :   ScheduleDAGMILive *DAG =
     199       73235 :     new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
     200       43941 :   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     201       43941 :   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     202       43941 :   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
     203       14647 :   return DAG;
     204             : }
     205             : 
     206             : static ScheduleDAGInstrs *
     207           3 : createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
     208             :   auto DAG = new GCNIterativeScheduler(C,
     209           3 :     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
     210           9 :   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     211           9 :   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     212           3 :   return DAG;
     213             : }
     214             : 
     215           3 : static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
     216             :   return new GCNIterativeScheduler(C,
     217           3 :     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
     218             : }
     219             : 
     220             : static MachineSchedRegistry
     221       72306 : R600SchedRegistry("r600", "Run R600's custom scheduler",
     222       72306 :                    createR600MachineScheduler);
     223             : 
     224             : static MachineSchedRegistry
     225       72306 : SISchedRegistry("si", "Run SI's custom scheduler",
     226       72306 :                 createSIMachineScheduler);
     227             : 
     228             : static MachineSchedRegistry
     229       72306 : GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
     230             :                              "Run GCN scheduler to maximize occupancy",
     231       72306 :                              createGCNMaxOccupancyMachineScheduler);
     232             : 
     233             : static MachineSchedRegistry
     234       72306 : IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
     235             :   "Run GCN scheduler to maximize occupancy (experimental)",
     236       72306 :   createIterativeGCNMaxOccupancyMachineScheduler);
     237             : 
     238             : static MachineSchedRegistry
     239       72306 : GCNMinRegSchedRegistry("gcn-minreg",
     240             :   "Run GCN iterative scheduler for minimal register usage (experimental)",
     241       72306 :   createMinRegScheduler);
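
MachineSchedRegistry entries feed the generic MachineScheduler's -misched
option, so each strategy registered above is selectable by name from the
command line. A sketch (hypothetical invocation):

    llc -mtriple=amdgcn -misched=gcn-minreg kernel.ll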
     242             : 
     243        2059 : static StringRef computeDataLayout(const Triple &TT) {
     244        2059 :   if (TT.getArch() == Triple::r600) {
     245             :     // 32-bit pointers.
     246             :     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
     247         255 :             "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
     248             :   }
     249             : 
     250             :   // 32-bit private, local, and region pointers. 64-bit global, constant and
     251             :   // flat.
     252        1804 :   if (TT.getEnvironmentName() == "amdgiz" ||
     253        1796 :       TT.getEnvironmentName() == "amdgizcl")
     254             :     return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
     255             :          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
     256          24 :          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
     257             :   return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
     258             :       "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
     259        1780 :       "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
     260             : }
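
These strings follow LLVM's DataLayout grammar: "e" declares little-endian,
"pN:S:A" gives the size and ABI alignment of pointers in address space N,
"nN:M" lists the natively supported integer widths, and "A5" marks address
space 5 as the alloca address space. A minimal standalone sketch (not part of
this file) of how a consumer reads those facts back out:

    #include "llvm/IR/DataLayout.h"

    static void inspectGCNLayout() {
      // The default (non-amdgiz) GCN layout returned above: 32-bit generic
      // pointers, 64-bit pointers in address space 1 (global memory).
      llvm::DataLayout DL(
          "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64");
      unsigned GenericBits = DL.getPointerSizeInBits(0); // 32
      unsigned GlobalBits = DL.getPointerSizeInBits(1);  // 64
      (void)GenericBits;
      (void)GlobalBits;
    }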
     261             : 
     262             : LLVM_READNONE
     263             : static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
     264        2059 :   if (!GPU.empty())
     265             :     return GPU;
     266             : 
     267         634 :   if (TT.getArch() == Triple::amdgcn)
     268             :     return "generic";
     269             : 
     270          28 :   return "r600";
     271             : }
     272             : 
     273             : static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
     274             :   // The AMDGPU toolchain only supports generating shared objects, so we
     275             :   // must always use PIC.
     276             :   return Reloc::PIC_;
     277             : }
     278             : 
     279             : static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
     280        2059 :   if (CM)
     281           0 :     return *CM;
     282             :   return CodeModel::Small;
     283             : }
     284             : 
     285        2059 : AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
     286             :                                          StringRef CPU, StringRef FS,
     287             :                                          TargetOptions Options,
     288             :                                          Optional<Reloc::Model> RM,
     289             :                                          Optional<CodeModel::Model> CM,
     290        2059 :                                          CodeGenOpt::Level OptLevel)
     291             :     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
     292             :                         FS, Options, getEffectiveRelocModel(RM),
     293             :                         getEffectiveCodeModel(CM), OptLevel),
     294       16472 :       TLOF(createTLOF(getTargetTriple())) {
     295        4118 :   AS = AMDGPU::getAMDGPUAS(TT);
     296        2059 :   initAsmInfo();
     297        2059 : }
     298             : 
     299             : AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
     300             : 
     301      401798 : StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
     302      803596 :   Attribute GPUAttr = F.getFnAttribute("target-cpu");
     303      401798 :   return GPUAttr.hasAttribute(Attribute::None) ?
     304      710043 :     getTargetCPU() : GPUAttr.getValueAsString();
     305             : }
     306             : 
     307      401798 : StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
     308      803596 :   Attribute FSAttr = F.getFnAttribute("target-features");
     309             : 
     310      401798 :   return FSAttr.hasAttribute(Attribute::None) ?
     311      254402 :     getTargetFeatureString() :
     312      549194 :     FSAttr.getValueAsString();
     313             : }
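
Both getters read per-function string attributes (the kind a frontend attaches
when compiling with per-function -mcpu/-target-feature settings) and fall back
to the TargetMachine-wide defaults when a function carries none; note the
hasAttribute(Attribute::None) idiom, which is true exactly when the string
attribute is absent. A hypothetical IR fragment carrying the two attributes
these functions consult:

    define amdgpu_kernel void @k() #0 {
      ret void
    }
    attributes #0 = { "target-cpu"="gfx900" "target-features"="+fp32-denormals" }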
     314             : 
     315         106 : static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
     316       12628 :   return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
     317       12310 :       if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
     318       12310 :         AAR.addAAResult(WrapperPass->getResult());
     319       12522 :       });
     320             : }
     321             : 
     322          94 : void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
     323          94 :   Builder.DivergentTarget = true;
     324             : 
     325          94 :   bool EnableOpt = getOptLevel() > CodeGenOpt::None;
     326          95 :   bool Internalize = InternalizeSymbols && EnableOpt &&
     327          95 :                      (getTargetTriple().getArch() == Triple::amdgcn);
     328          94 :   bool EarlyInline = EarlyInlineAll && EnableOpt;
     329          94 :   bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
     330          94 :   bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
     331             : 
     332         282 :   Builder.addExtension(
     333             :     PassManagerBuilder::EP_ModuleOptimizerEarly,
     334             :     [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
     335         159 :                                          legacy::PassManagerBase &PM) {
     336          53 :       if (AMDGPUAA) {
     337          53 :         PM.add(createAMDGPUAAWrapperPass());
     338          53 :         PM.add(createAMDGPUExternalAAWrapperPass());
     339             :       }
     340          53 :       PM.add(createAMDGPUUnifyMetadataPass());
     341          53 :       if (Internalize) {
     342           8 :         PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool {
     343           5 :           if (const Function *F = dyn_cast<Function>(&GV)) {
     344           3 :             if (F->isDeclaration())
     345             :                 return true;
     346           3 :             switch (F->getCallingConv()) {
     347             :             default:
     348             :               return false;
     349           3 :             case CallingConv::AMDGPU_VS:
     350             :             case CallingConv::AMDGPU_HS:
     351             :             case CallingConv::AMDGPU_GS:
     352             :             case CallingConv::AMDGPU_PS:
     353             :             case CallingConv::AMDGPU_CS:
     354             :             case CallingConv::AMDGPU_KERNEL:
     355             :             case CallingConv::SPIR_KERNEL:
     356             :               return true;
     357             :             }
     358             :           }
     359           4 :           return !GV.use_empty();
     360           1 :         }));
     361           1 :         PM.add(createGlobalDCEPass());
     362             :       }
     363          53 :       if (EarlyInline)
     364           1 :         PM.add(createAMDGPUAlwaysInlinePass(false));
     365          53 :   });
     366             : 
     367         282 :   Builder.addExtension(
     368             :     PassManagerBuilder::EP_EarlyAsPossible,
     369             :     [AMDGPUAA, LibCallSimplify](const PassManagerBuilder &,
     370         188 :                                 legacy::PassManagerBase &PM) {
     371          94 :       if (AMDGPUAA) {
     372          53 :         PM.add(createAMDGPUAAWrapperPass());
     373          53 :         PM.add(createAMDGPUExternalAAWrapperPass());
     374             :       }
     375          94 :       PM.add(llvm::createAMDGPUUseNativeCallsPass());
     376          94 :       if (LibCallSimplify)
     377          53 :         PM.add(llvm::createAMDGPUSimplifyLibCallsPass());
     378          94 :   });
     379             : 
     380         282 :   Builder.addExtension(
     381             :     PassManagerBuilder::EP_CGSCCOptimizerLate,
     382             :     [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
     383             :       // Add infer address spaces pass to the opt pipeline after inlining
     384             :       // but before SROA to increase SROA opportunities.
     385          53 :       PM.add(createInferAddressSpacesPass());
     386             :   });
     387          94 : }
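
adjustPassManager() is the hook the legacy PassManagerBuilder gives targets
for splicing passes into the standard -O pipelines; the three extension points
used above fire early in module optimization, as early as possible in the
per-function pipeline, and late in the CGSCC (post-inlining) walk. A minimal
driver sketch (hypothetical code, assuming an already-constructed
TargetMachine) showing where the hook is invoked:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Target/TargetMachine.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"

    void buildPipeline(llvm::TargetMachine &TM, llvm::legacy::PassManager &PM) {
      llvm::PassManagerBuilder Builder;
      Builder.OptLevel = 2;
      TM.adjustPassManager(Builder); // installs the extensions defined above
      Builder.populateModulePassManager(PM);
    }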
     388             : 
     389             : //===----------------------------------------------------------------------===//
     390             : // R600 Target Machine (R600 -> Cayman)
     391             : //===----------------------------------------------------------------------===//
     392             : 
     393         255 : R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
     394             :                                      StringRef CPU, StringRef FS,
     395             :                                      TargetOptions Options,
     396             :                                      Optional<Reloc::Model> RM,
     397             :                                      Optional<CodeModel::Model> CM,
     398         255 :                                      CodeGenOpt::Level OL, bool JIT)
     399        1785 :     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
     400         510 :   setRequiresStructuredCFG(true);
     401         255 : }
     402             : 
     403       38531 : const R600Subtarget *R600TargetMachine::getSubtargetImpl(
     404             :   const Function &F) const {
     405       38531 :   StringRef GPU = getGPUName(F);
     406       38531 :   StringRef FS = getFeatureString(F);
     407             : 
     408       77062 :   SmallString<128> SubtargetKey(GPU);
     409       38531 :   SubtargetKey.append(FS);
     410             : 
     411       77062 :   auto &I = SubtargetMap[SubtargetKey];
     412       38531 :   if (!I) {
     413             :     // This needs to be done before we create a new subtarget since any
     414             :     // creation will depend on the TM and the code generation flags on the
     415             :     // function that reside in TargetOptions.
     416         253 :     resetTargetOptions(F);
     417         253 :     I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
     418             :   }
     419             : 
     420       77062 :   return I.get();
     421             : }
     422             : 
     423             : //===----------------------------------------------------------------------===//
     424             : // GCN Target Machine (SI+)
     425             : //===----------------------------------------------------------------------===//
     426             : 
     427        1804 : GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
     428             :                                    StringRef CPU, StringRef FS,
     429             :                                    TargetOptions Options,
     430             :                                    Optional<Reloc::Model> RM,
     431             :                                    Optional<CodeModel::Model> CM,
     432        1804 :                                    CodeGenOpt::Level OL, bool JIT)
     433       12628 :     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
     434             : 
     435      363267 : const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
     436      363267 :   StringRef GPU = getGPUName(F);
     437      363267 :   StringRef FS = getFeatureString(F);
     438             : 
     439      726534 :   SmallString<128> SubtargetKey(GPU);
     440      363267 :   SubtargetKey.append(FS);
     441             : 
     442      726534 :   auto &I = SubtargetMap[SubtargetKey];
     443      363267 :   if (!I) {
     444             :     // This needs to be done before we create a new subtarget since any
     445             :     // creation will depend on the TM and the code generation flags on the
     446             :     // function that reside in TargetOptions.
     447        1796 :     resetTargetOptions(F);
     448        1796 :     I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
     449             :   }
     450             : 
     451     1089801 :   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
     452             : 
     453      726534 :   return I.get();
     454             : }
     455             : 
     456             : //===----------------------------------------------------------------------===//
     457             : // AMDGPU Pass Setup
     458             : //===----------------------------------------------------------------------===//
     459             : 
     460             : namespace {
     461             : 
     462        1947 : class AMDGPUPassConfig : public TargetPassConfig {
     463             : public:
     464        1956 :   AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     465        1956 :     : TargetPassConfig(TM, PM) {
     466             :     // Exceptions and StackMaps are not supported, so these passes will never do
     467             :     // anything.
     468        3912 :     disablePass(&StackMapLivenessID);
     469        3912 :     disablePass(&FuncletLayoutID);
     470        1956 :   }
     471             : 
     472             :   AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
     473        3419 :     return getTM<AMDGPUTargetMachine>();
     474             :   }
     475             : 
     476             :   ScheduleDAGInstrs *
     477           0 :   createMachineScheduler(MachineSchedContext *C) const override {
     478           0 :     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
     479           0 :     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     480           0 :     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     481           0 :     return DAG;
     482             :   }
     483             : 
     484             :   void addEarlyCSEOrGVNPass();
     485             :   void addStraightLineScalarOptimizationPasses();
     486             :   void addIRPasses() override;
     487             :   void addCodeGenPrepare() override;
     488             :   bool addPreISel() override;
     489             :   bool addInstSelector() override;
     490             :   bool addGCPasses() override;
     491             : };
     492             : 
     493         494 : class R600PassConfig final : public AMDGPUPassConfig {
     494             : public:
     495             :   R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     496         248 :     : AMDGPUPassConfig(TM, PM) {}
     497             : 
     498        2057 :   ScheduleDAGInstrs *createMachineScheduler(
     499             :     MachineSchedContext *C) const override {
     500        2057 :     return createR600MachineScheduler(C);
     501             :   }
     502             : 
     503             :   bool addPreISel() override;
     504             :   bool addInstSelector() override;
     505             :   void addPreRegAlloc() override;
     506             :   void addPreSched2() override;
     507             :   void addPreEmitPass() override;
     508             : };
     509             : 
     510        3400 : class GCNPassConfig final : public AMDGPUPassConfig {
     511             : public:
     512        1708 :   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     513        1708 :     : AMDGPUPassConfig(TM, PM) {
     514             :     // It is necessary to know the register usage of the entire call graph.  We
     515             :     // allow calls without EnableAMDGPUFunctionCalls if they are marked
     516             :     // noinline, so this is always required.
     517        3416 :     setRequiresCodeGenSCCOrder(true);
     518        1708 :   }
     519             : 
     520             :   GCNTargetMachine &getGCNTargetMachine() const {
     521             :     return getTM<GCNTargetMachine>();
     522             :   }
     523             : 
     524             :   ScheduleDAGInstrs *
     525             :   createMachineScheduler(MachineSchedContext *C) const override;
     526             : 
     527             :   bool addPreISel() override;
     528             :   void addMachineSSAOptimization() override;
     529             :   bool addILPOpts() override;
     530             :   bool addInstSelector() override;
     531             :   bool addIRTranslator() override;
     532             :   bool addLegalizeMachineIR() override;
     533             :   bool addRegBankSelect() override;
     534             :   bool addGlobalInstructionSelect() override;
     535             :   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
     536             :   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
     537             :   void addPreRegAlloc() override;
     538             :   void addPostRegAlloc() override;
     539             :   void addPreSched2() override;
     540             :   void addPreEmitPass() override;
     541             : };
     542             : 
     543             : } // end anonymous namespace
     544             : 
     545        2141 : TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
     546      285942 :   return TargetIRAnalysis([this](const Function &F) {
     547      571884 :     return TargetTransformInfo(AMDGPUTTIImpl(this, F));
     548      292365 :   });
     549             : }
     550             : 
     551        3334 : void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
     552        3334 :   if (getOptLevel() == CodeGenOpt::Aggressive)
     553           0 :     addPass(createGVNPass());
     554             :   else
     555        3334 :     addPass(createEarlyCSEPass());
     556        3334 : }
     557             : 
     558        1667 : void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
     559        1667 :   addPass(createSeparateConstOffsetFromGEPPass());
     560        1667 :   addPass(createSpeculativeExecutionPass());
      561             :   // ReassociateGEPs exposes more opportunities for SLSR. See
     562             :   // the example in reassociate-geps-and-slsr.ll.
     563        1667 :   addPass(createStraightLineStrengthReducePass());
      564             :   // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
     565             :   // EarlyCSE can reuse.
     566        1667 :   addEarlyCSEOrGVNPass();
     567             :   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
     568        1667 :   addPass(createNaryReassociatePass());
     569             :   // NaryReassociate on GEPs creates redundant common expressions, so run
     570             :   // EarlyCSE after it.
     571        1667 :   addPass(createEarlyCSEPass());
     572        1667 : }
     573             : 
     574        1712 : void AMDGPUPassConfig::addIRPasses() {
     575        1712 :   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
     576             : 
     577             :   // There is no reason to run these.
     578        3424 :   disablePass(&StackMapLivenessID);
     579        3424 :   disablePass(&FuncletLayoutID);
     580        3424 :   disablePass(&PatchableFunctionID);
     581             : 
     582        1712 :   addPass(createAMDGPULowerIntrinsicsPass());
     583             : 
     584        3180 :   if (TM.getTargetTriple().getArch() == Triple::r600 ||
     585        1468 :       !EnableAMDGPUFunctionCalls) {
     586             :     // Function calls are not supported, so make sure we inline everything.
     587        1712 :     addPass(createAMDGPUAlwaysInlinePass());
     588        1712 :     addPass(createAlwaysInlinerLegacyPass());
     589             :     // We need to add the barrier noop pass, otherwise adding the function
      590             :     // inlining pass will cause all of the PassConfig's passes to be run
      591             :     // one function at a time, which means if we have a module with two
     592             :     // functions, then we will generate code for the first function
     593             :     // without ever running any passes on the second.
     594        1712 :     addPass(createBarrierNoopPass());
     595             :   }
     596             : 
     597        1712 :   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
     598             :     // TODO: May want to move later or split into an early and late one.
     599             : 
     600        1468 :     addPass(createAMDGPUCodeGenPreparePass());
     601             :   }
     602             : 
     603             :   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
     604        1712 :   addPass(createAMDGPUOpenCLImageTypeLoweringPass());
     605             : 
     606        1712 :   if (TM.getOptLevel() > CodeGenOpt::None) {
     607        1667 :     addPass(createInferAddressSpacesPass());
     608        1667 :     addPass(createAMDGPUPromoteAlloca());
     609             : 
     610        1667 :     if (EnableSROA)
     611        1651 :       addPass(createSROAPass());
     612             : 
     613        1667 :     addStraightLineScalarOptimizationPasses();
     614             : 
     615        1667 :     if (EnableAMDGPUAliasAnalysis) {
     616        1655 :       addPass(createAMDGPUAAWrapperPass());
     617        4965 :       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
     618       77410 :                                              AAResults &AAR) {
     619       77410 :         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
     620       77410 :           AAR.addAAResult(WrapperPass->getResult());
     621       77410 :         }));
     622             :     }
     623             :   }
     624             : 
     625        1712 :   TargetPassConfig::addIRPasses();
     626             : 
     627             :   // EarlyCSE is not always strong enough to clean up what LSR produces. For
     628             :   // example, GVN can combine
     629             :   //
     630             :   //   %0 = add %a, %b
     631             :   //   %1 = add %b, %a
     632             :   //
     633             :   // and
     634             :   //
     635             :   //   %0 = shl nsw %a, 2
     636             :   //   %1 = shl %a, 2
     637             :   //
     638             :   // but EarlyCSE can do neither of them.
     639        1712 :   if (getOptLevel() != CodeGenOpt::None)
     640        1667 :     addEarlyCSEOrGVNPass();
     641        1712 : }
     642             : 
     643        1712 : void AMDGPUPassConfig::addCodeGenPrepare() {
     644        1712 :   TargetPassConfig::addCodeGenPrepare();
     645             : 
     646        1712 :   if (EnableLoadStoreVectorizer)
     647        1703 :     addPass(createLoadStoreVectorizerPass());
     648        1712 : }
     649             : 
     650        1712 : bool AMDGPUPassConfig::addPreISel() {
     651        1712 :   addPass(createFlattenCFGPass());
     652        1712 :   return false;
     653             : }
     654             : 
     655        1463 : bool AMDGPUPassConfig::addInstSelector() {
     656        2926 :   addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
     657        1463 :   return false;
     658             : }
     659             : 
     660        1712 : bool AMDGPUPassConfig::addGCPasses() {
     661             :   // Do nothing. GC is not supported.
     662        1712 :   return false;
     663             : }
     664             : 
     665             : //===----------------------------------------------------------------------===//
     666             : // R600 Pass Setup
     667             : //===----------------------------------------------------------------------===//
     668             : 
     669         244 : bool R600PassConfig::addPreISel() {
     670         244 :   AMDGPUPassConfig::addPreISel();
     671             : 
     672         244 :   if (EnableR600StructurizeCFG)
     673         242 :     addPass(createStructurizeCFGPass());
     674         244 :   return false;
     675             : }
     676             : 
     677         244 : bool R600PassConfig::addInstSelector() {
     678         488 :   addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
     679         244 :   return false;
     680             : }
     681             : 
     682         244 : void R600PassConfig::addPreRegAlloc() {
     683         244 :   addPass(createR600VectorRegMerger());
     684         244 : }
     685             : 
     686         244 : void R600PassConfig::addPreSched2() {
     687         244 :   addPass(createR600EmitClauseMarkers(), false);
     688         244 :   if (EnableR600IfConvert)
     689         243 :     addPass(&IfConverterID, false);
     690         244 :   addPass(createR600ClauseMergePass(), false);
     691         244 : }
     692             : 
     693         244 : void R600PassConfig::addPreEmitPass() {
     694         244 :   addPass(createAMDGPUCFGStructurizerPass(), false);
     695         244 :   addPass(createR600ExpandSpecialInstrsPass(), false);
     696         244 :   addPass(&FinalizeMachineBundlesID, false);
     697         244 :   addPass(createR600Packetizer(), false);
     698         244 :   addPass(createR600ControlFlowFinalizer(), false);
     699         244 : }
     700             : 
     701         248 : TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
     702         496 :   return new R600PassConfig(*this, PM);
     703             : }
     704             : 
     705             : //===----------------------------------------------------------------------===//
     706             : // GCN Pass Setup
     707             : //===----------------------------------------------------------------------===//
     708             : 
     709       14647 : ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
     710             :   MachineSchedContext *C) const {
     711       14647 :   const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
     712       14647 :   if (ST.enableSIScheduler())
     713           0 :     return createSIMachineScheduler(C);
     714       14647 :   return createGCNMaxOccupancyMachineScheduler(C);
     715             : }
     716             : 
     717        1468 : bool GCNPassConfig::addPreISel() {
     718        1468 :   AMDGPUPassConfig::addPreISel();
     719             : 
     720             :   // FIXME: We need to run a pass to propagate the attributes when calls are
     721             :   // supported.
     722        1468 :   addPass(createAMDGPUAnnotateKernelFeaturesPass());
     723             : 
     724             :   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
     725             :   // regions formed by them.
     726        1468 :   addPass(&AMDGPUUnifyDivergentExitNodesID);
     727        1468 :   if (!LateCFGStructurize) {
     728        1468 :     addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
     729             :   }
     730        1468 :   addPass(createSinkingPass());
     731        1468 :   addPass(createAMDGPUAnnotateUniformValues());
     732        1468 :   if (!LateCFGStructurize) {
     733        1468 :     addPass(createSIAnnotateControlFlowPass());
     734             :   }
     735             : 
     736        1468 :   return false;
     737             : }
     738             : 
     739        1424 : void GCNPassConfig::addMachineSSAOptimization() {
     740        1424 :   TargetPassConfig::addMachineSSAOptimization();
     741             : 
     742             :   // We want to fold operands after PeepholeOptimizer has run (or as part of
     743             :   // it), because it will eliminate extra copies making it easier to fold the
     744             :   // real source operand. We want to eliminate dead instructions after, so that
     745             :   // we see fewer uses of the copies. We then need to clean up the dead
     746             :   // instructions leftover after the operands are folded as well.
     747             :   //
     748             :   // XXX - Can we get away without running DeadMachineInstructionElim again?
     749        1424 :   addPass(&SIFoldOperandsID);
     750        1424 :   addPass(&DeadMachineInstructionElimID);
     751        1424 :   addPass(&SILoadStoreOptimizerID);
     752        1424 :   if (EnableSDWAPeephole) {
     753        1420 :     addPass(&SIPeepholeSDWAID);
     754        1420 :     addPass(&MachineLICMID);
     755        1420 :     addPass(&MachineCSEID);
     756        1420 :     addPass(&SIFoldOperandsID);
     757        1420 :     addPass(&DeadMachineInstructionElimID);
     758             :   }
     759        1424 :   addPass(createSIShrinkInstructionsPass());
     760        1424 : }
     761             : 
     762        1424 : bool GCNPassConfig::addILPOpts() {
     763        1424 :   if (EnableEarlyIfConversion)
     764           2 :     addPass(&EarlyIfConverterID);
     765             : 
     766        1424 :   TargetPassConfig::addILPOpts();
     767        1424 :   return false;
     768             : }
     769             : 
     770        1463 : bool GCNPassConfig::addInstSelector() {
     771        1463 :   AMDGPUPassConfig::addInstSelector();
     772        1463 :   addPass(createSILowerI1CopiesPass());
     773        1463 :   addPass(&SIFixSGPRCopiesID);
     774        1463 :   return false;
     775             : }
     776             : 
     777           5 : bool GCNPassConfig::addIRTranslator() {
     778           5 :   addPass(new IRTranslator());
     779           5 :   return false;
     780             : }
     781             : 
     782           5 : bool GCNPassConfig::addLegalizeMachineIR() {
     783           5 :   addPass(new Legalizer());
     784           5 :   return false;
     785             : }
     786             : 
     787           5 : bool GCNPassConfig::addRegBankSelect() {
     788           5 :   addPass(new RegBankSelect());
     789           5 :   return false;
     790             : }
     791             : 
     792           5 : bool GCNPassConfig::addGlobalInstructionSelect() {
     793           5 :   addPass(new InstructionSelect());
     794           5 :   return false;
     795             : }
     796             : 
     797        1468 : void GCNPassConfig::addPreRegAlloc() {
     798        1468 :   if (LateCFGStructurize) {
     799           0 :     addPass(createAMDGPUMachineCFGStructurizerPass());
     800             :   }
     801        1468 :   addPass(createSIWholeQuadModePass());
     802        1468 : }
     803             : 
     804          44 : void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
     805             :   // FIXME: We have to disable the verifier here because of PHIElimination +
     806             :   // TwoAddressInstructions disabling it.
     807             : 
     808             :   // This must be run immediately after phi elimination and before
     809             :   // TwoAddressInstructions, otherwise the processing of the tied operand of
     810             :   // SI_ELSE will introduce a copy of the tied operand source after the else.
     811          88 :   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
     812             : 
     813             :   // This must be run after SILowerControlFlow, since it needs to use the
     814             :   // machine-level CFG, but before register allocation.
     815          88 :   insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
     816             : 
     817          44 :   TargetPassConfig::addFastRegAlloc(RegAllocPass);
     818          44 : }
     819             : 
     820        1424 : void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
     821        2848 :   insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
     822             : 
     823             :   // This must be run immediately after phi elimination and before
     824             :   // TwoAddressInstructions, otherwise the processing of the tied operand of
     825             :   // SI_ELSE will introduce a copy of the tied operand source after the else.
     826        2848 :   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
     827             : 
     828             :   // This must be run after SILowerControlFlow, since it needs to use the
     829             :   // machine-level CFG, but before register allocation.
     830        2848 :   insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
     831             : 
     832        1424 :   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
     833        1424 : }
     834             : 
     835        1468 : void GCNPassConfig::addPostRegAlloc() {
     836        1468 :   addPass(&SIFixVGPRCopiesID);
     837        1468 :   addPass(&SIOptimizeExecMaskingID);
     838        1468 :   TargetPassConfig::addPostRegAlloc();
     839        1468 : }
     840             : 
     841        1468 : void GCNPassConfig::addPreSched2() {
     842        1468 : }
     843             : 
     844        1468 : void GCNPassConfig::addPreEmitPass() {
     845             :   // The hazard recognizer that runs as part of the post-ra scheduler does not
      846             :   // guarantee to be able to handle all hazards correctly. This is because if there
     847             :   // are multiple scheduling regions in a basic block, the regions are scheduled
     848             :   // bottom up, so when we begin to schedule a region we don't know what
     849             :   // instructions were emitted directly before it.
     850             :   //
     851             :   // Here we add a stand-alone hazard recognizer pass which can handle all
     852             :   // cases.
     853        1468 :   addPass(&PostRAHazardRecognizerID);
     854             : 
     855        1468 :   if (EnableSIInsertWaitcntsPass)
     856        1468 :     addPass(createSIInsertWaitcntsPass());
     857             :   else
     858           0 :     addPass(createSIInsertWaitsPass());
     859        1468 :   addPass(createSIShrinkInstructionsPass());
     860        1468 :   addPass(&SIInsertSkipsPassID);
     861        1468 :   addPass(createSIMemoryLegalizerPass());
     862        1468 :   addPass(createSIDebuggerInsertNopsPass());
     863        1468 :   addPass(&BranchRelaxationPassID);
     864        1468 : }
     865             : 
     866        1708 : TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
     867        1708 :   return new GCNPassConfig(*this, PM);
     868      216918 : }
     869             : 

Generated by: LCOV version 1.13