LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUTargetMachine.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-10-20 13:21:21
Lines: 314 of 326 hit (96.3%)
Functions: 51 of 54 hit (94.4%)

//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
  "amdgpu-function-calls",
  cl::desc("Enable AMDGPU function call support"),
  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
  cl::init(false),
  cl::Hidden);

// Enable library call simplifications.
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

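// Usage sketch: each cl::opt above registers a command-line flag that LLVM
// tools such as llc and opt pick up, so any of these booleans can be toggled
// explicitly. A hypothetical invocation (the input file name is a
// placeholder):
//
//   llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole=0 input.ll
//
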
extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIFixWWMLivenessPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUInlinerPass(*PR);
}

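// Usage sketch: tools normally reach the entry point above through the
// generic helpers declared in llvm/Support/TargetSelect.h rather than by
// calling it directly, e.g.:
//
//   InitializeAllTargets();     // includes LLVMInitializeAMDGPUTarget()
//   InitializeAllTargetMCs();
//   InitializeAllAsmPrinters();
//
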
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return llvm::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);

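// Usage sketch: a scheduler registered with MachineSchedRegistry can be
// selected by its registered name via llc's -misched option, e.g. the
// hypothetical invocation:
//
//   llc -march=amdgcn -misched=gcn-ilp input.ll
//
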
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}

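// For illustration, the string computed above becomes the module data layout
// in emitted IR; an amdgcn module would carry something like the following
// (the triple is one common example, and "..." elides the rest of the string
// returned above):
//
//   target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-..."
//   target triple = "amdgcn-amd-amdhsa"
//
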
LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  if (TT.getArch() == Triple::amdgcn)
    return "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
  if (CM)
    return *CM;
  return CodeModel::Small;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

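// For illustration, the two accessors above read per-function IR attributes,
// falling back to the TargetMachine-wide defaults when a function carries
// none. Hypothetical attribute values:
//
//   define amdgpu_kernel void @k() #0 { ... }
//   attributes #0 = { "target-cpu"="gfx900" "target-features"="+fp64" }
//
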
static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
  return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
        AAR.addAAResult(WrapperPass->getResult());
      });
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}

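// For illustration, with -amdgpu-internalize-symbols the predicate above
// keeps entry points and declarations externally visible, while other
// definitions are internalized and, if unused, removed by the GlobalDCE run
// scheduled right after Internalize (see adjustPassManager below).
// Hypothetical module:
//
//   define amdgpu_kernel void @kern() { ... }  ; preserved (entry function)
//   define void @helper() { ... }              ; internalized; dropped if dead
//
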
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableAMDGPUFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createAMDGPUFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
                                         legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  const auto &Opt = Options;
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
                                      legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());
  });
}

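// Usage sketch: a frontend driving the legacy pass manager would let the hook
// above customize its pipeline roughly as follows (a minimal sketch; MPM is a
// hypothetical legacy::PassManager):
//
//   PassManagerBuilder Builder;
//   TM->adjustPassManager(Builder);          // installs the extensions above
//   Builder.populateModulePassManager(MPM);
//
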
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAtomicExpandPass());
  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.

    addPass(createAMDGPUCodeGenPreparePass());
  }

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createLowerSwitchPass());
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableAtomicOptimizations) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(createAMDGPUAnnotateKernelFeaturesPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  //
  // FIXME: This stand-alone pass will emit individual S_NOP 0 instructions as
  // needed. It would be better for it to emit S_NOP <N> when possible.
  addPass(&PostRAHazardRecognizerID);

  addPass(&SIInsertSkipsPassID);
  addPass(createSIDebuggerInsertNopsPass());
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

Generated by: LCOV version 1.13