LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUTargetMachine.cpp (source / functions)
Test: llvm-toolchain.info    Lines:     392 of 400 hit (98.0 %)
Date: 2018-05-20 00:06:23    Functions:  59 of  64 hit (92.2 %)
Legend: Lines: hit | not hit

          Line data    Source code
       1             : //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// The AMDGPU target machine contains all of the hardware specific
      12             : /// information needed to emit code for R600 and SI GPUs.
      13             : //
      14             : //===----------------------------------------------------------------------===//
      15             : 
      16             : #include "AMDGPUTargetMachine.h"
      17             : #include "AMDGPU.h"
      18             : #include "AMDGPUAliasAnalysis.h"
      19             : #include "AMDGPUCallLowering.h"
      20             : #include "AMDGPUInstructionSelector.h"
      21             : #include "AMDGPULegalizerInfo.h"
      22             : #include "AMDGPUMacroFusion.h"
      23             : #include "AMDGPUTargetObjectFile.h"
      24             : #include "AMDGPUTargetTransformInfo.h"
      25             : #include "GCNIterativeScheduler.h"
      26             : #include "GCNSchedStrategy.h"
      27             : #include "R600MachineScheduler.h"
      28             : #include "SIMachineScheduler.h"
      29             : #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
      30             : #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
      31             : #include "llvm/CodeGen/GlobalISel/Legalizer.h"
      32             : #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
      33             : #include "llvm/CodeGen/Passes.h"
      34             : #include "llvm/CodeGen/TargetPassConfig.h"
      35             : #include "llvm/IR/Attributes.h"
      36             : #include "llvm/IR/Function.h"
      37             : #include "llvm/IR/LegacyPassManager.h"
      38             : #include "llvm/Pass.h"
      39             : #include "llvm/Support/CommandLine.h"
      40             : #include "llvm/Support/Compiler.h"
      41             : #include "llvm/Support/TargetRegistry.h"
      42             : #include "llvm/Target/TargetLoweringObjectFile.h"
      43             : #include "llvm/Transforms/IPO.h"
      44             : #include "llvm/Transforms/IPO/AlwaysInliner.h"
      45             : #include "llvm/Transforms/IPO/PassManagerBuilder.h"
      46             : #include "llvm/Transforms/Scalar.h"
      47             : #include "llvm/Transforms/Scalar/GVN.h"
      48             : #include "llvm/Transforms/Vectorize.h"
      49             : #include <memory>
      50             : 
      51             : using namespace llvm;
      52             : 
      53       99237 : static cl::opt<bool> EnableR600StructurizeCFG(
      54             :   "r600-ir-structurize",
      55       99237 :   cl::desc("Use StructurizeCFG IR pass"),
      56      297711 :   cl::init(true));
      57             : 
      58       99237 : static cl::opt<bool> EnableSROA(
      59             :   "amdgpu-sroa",
      60       99237 :   cl::desc("Run SROA after promote alloca pass"),
      61             :   cl::ReallyHidden,
      62      297711 :   cl::init(true));
      63             : 
      64             : static cl::opt<bool>
      65       99237 : EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
      66       99237 :                         cl::desc("Run early if-conversion"),
      67      297711 :                         cl::init(false));
      68             : 
      69       99237 : static cl::opt<bool> EnableR600IfConvert(
      70             :   "r600-if-convert",
      71       99237 :   cl::desc("Use if conversion pass"),
      72             :   cl::ReallyHidden,
      73      297711 :   cl::init(true));
      74             : 
      75             : // Option to disable vectorizer for tests.
      76       99237 : static cl::opt<bool> EnableLoadStoreVectorizer(
      77             :   "amdgpu-load-store-vectorizer",
      78       99237 :   cl::desc("Enable load store vectorizer"),
      79      198474 :   cl::init(true),
      80      297711 :   cl::Hidden);
      81             : 
      82             : // Option to control global loads scalarization
      83       99237 : static cl::opt<bool> ScalarizeGlobal(
      84             :   "amdgpu-scalarize-global-loads",
      85       99237 :   cl::desc("Enable global load scalarization"),
      86      198474 :   cl::init(true),
      87      297711 :   cl::Hidden);
      88             : 
      89             : // Option to run internalize pass.
      90       99237 : static cl::opt<bool> InternalizeSymbols(
      91             :   "amdgpu-internalize-symbols",
      92       99237 :   cl::desc("Enable elimination of non-kernel functions and unused globals"),
      93      198474 :   cl::init(false),
      94      297711 :   cl::Hidden);
      95             : 
      96             : // Option to inline all early.
      97       99237 : static cl::opt<bool> EarlyInlineAll(
      98             :   "amdgpu-early-inline-all",
      99       99237 :   cl::desc("Inline all functions early"),
     100      198474 :   cl::init(false),
     101      297711 :   cl::Hidden);
     102             : 
     103       99237 : static cl::opt<bool> EnableSDWAPeephole(
     104             :   "amdgpu-sdwa-peephole",
     105       99237 :   cl::desc("Enable SDWA peepholer"),
     106      297711 :   cl::init(true));
     107             : 
     108             : // Enable address space based alias analysis
     109       99237 : static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
     110       99237 :   cl::desc("Enable AMDGPU Alias Analysis"),
     111      297711 :   cl::init(true));
     112             : 
     113             : // Option to enable new waitcnt insertion pass.
     114       99237 : static cl::opt<bool> EnableSIInsertWaitcntsPass(
     115             :   "enable-si-insert-waitcnts",
     116       99237 :   cl::desc("Use new waitcnt insertion pass"),
     117      297711 :   cl::init(true));
     118             : 
     119             : // Option to run late CFG structurizer
     120       99237 : static cl::opt<bool, true> LateCFGStructurize(
     121             :   "amdgpu-late-structurize",
     122       99237 :   cl::desc("Enable late CFG structurization"),
     123      198474 :   cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
     124      297711 :   cl::Hidden);
     125             : 
     126       99237 : static cl::opt<bool> EnableAMDGPUFunctionCalls(
     127             :   "amdgpu-function-calls",
     128             :   cl::Hidden,
     129       99237 :   cl::desc("Enable AMDGPU function call support"),
     130      297711 :   cl::init(false));
     131             : 
     132             : // Enable lib call simplifications
     133       99237 : static cl::opt<bool> EnableLibCallSimplify(
     134             :   "amdgpu-simplify-libcall",
     135       99237 :   cl::desc("Enable amdgpu library simplifications"),
     136      198474 :   cl::init(true),
     137      297711 :   cl::Hidden);
     138             : 
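                     : // --- Editor's sketch, not part of the measured file ---
                     : // Every flag above follows this llvm::cl pattern: the parsed value is read
                     : // like a plain bool (e.g. "if (EnableSROA)"), and defaults can be overridden
                     : // on the command line, e.g. "llc -amdgpu-sroa=0". The option name
                     : // "amdgpu-example" and variable "EnableExample" below are hypothetical:
                     : static cl::opt<bool> EnableExample(
                     :   "amdgpu-example",                        // spelled -amdgpu-example=<bool>
                     :   cl::desc("Enable a hypothetical example transform"),
                     :   cl::init(false),                         // value when the flag is absent
                     :   cl::Hidden);                             // hidden from plain -help output
                     : 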
     139      100810 : extern "C" void LLVMInitializeAMDGPUTarget() {
     140             :   // Register the target
     141      100810 :   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
     142      100810 :   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
     143             : 
     144      100810 :   PassRegistry *PR = PassRegistry::getPassRegistry();
     145      100810 :   initializeR600ClauseMergePassPass(*PR);
     146      100810 :   initializeR600ControlFlowFinalizerPass(*PR);
     147      100810 :   initializeR600PacketizerPass(*PR);
     148      100810 :   initializeR600ExpandSpecialInstrsPassPass(*PR);
     149      100810 :   initializeR600VectorRegMergerPass(*PR);
     150      100810 :   initializeGlobalISel(*PR);
     151      100810 :   initializeAMDGPUDAGToDAGISelPass(*PR);
     152      100810 :   initializeSILowerI1CopiesPass(*PR);
     153      100810 :   initializeSIFixSGPRCopiesPass(*PR);
     154      100810 :   initializeSIFixVGPRCopiesPass(*PR);
     155      100810 :   initializeSIFoldOperandsPass(*PR);
     156      100810 :   initializeSIPeepholeSDWAPass(*PR);
     157      100810 :   initializeSIShrinkInstructionsPass(*PR);
     158      100810 :   initializeSIOptimizeExecMaskingPreRAPass(*PR);
     159      100810 :   initializeSILoadStoreOptimizerPass(*PR);
     160      100810 :   initializeAMDGPUAlwaysInlinePass(*PR);
     161      100810 :   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
     162      100810 :   initializeAMDGPUAnnotateUniformValuesPass(*PR);
     163      100810 :   initializeAMDGPUArgumentUsageInfoPass(*PR);
     164      100810 :   initializeAMDGPULowerIntrinsicsPass(*PR);
     165      100810 :   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
     166      100810 :   initializeAMDGPUPromoteAllocaPass(*PR);
     167      100810 :   initializeAMDGPUCodeGenPreparePass(*PR);
     168      100810 :   initializeAMDGPURewriteOutArgumentsPass(*PR);
     169      100810 :   initializeAMDGPUUnifyMetadataPass(*PR);
     170      100810 :   initializeSIAnnotateControlFlowPass(*PR);
     171      100810 :   initializeSIInsertWaitcntsPass(*PR);
     172      100810 :   initializeSIWholeQuadModePass(*PR);
     173      100810 :   initializeSILowerControlFlowPass(*PR);
     174      100810 :   initializeSIInsertSkipsPass(*PR);
     175      100810 :   initializeSIMemoryLegalizerPass(*PR);
     176      100810 :   initializeSIDebuggerInsertNopsPass(*PR);
     177      100810 :   initializeSIOptimizeExecMaskingPass(*PR);
     178      100810 :   initializeSIFixWWMLivenessPass(*PR);
     179      100810 :   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
     180      100810 :   initializeAMDGPUAAWrapperPassPass(*PR);
     181      100810 :   initializeAMDGPUUseNativeCallsPass(*PR);
     182      100810 :   initializeAMDGPUSimplifyLibCallsPass(*PR);
     183      100810 :   initializeAMDGPUInlinerPass(*PR);
     184      100810 : }
     185             : 
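                     : // --- Editor's sketch, not part of the measured file ---
                     : // Hypothetical driver code showing how a tool reaches the targets registered
                     : // above: once initialization has run, TargetRegistry::lookupTarget resolves
                     : // a triple string to the matching Target.
                     : static void exampleLookupTarget() {
                     :   LLVMInitializeAMDGPUTarget();
                     :   std::string Error;
                     :   const Target *T = TargetRegistry::lookupTarget("amdgcn--amdhsa", Error);
                     :   (void)T; // null on failure (diagnostic in Error); a real driver would
                     :            // call T->createTargetMachine(...) next.
                     : }
                     : 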
     186             : static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
     187        5016 :   return llvm::make_unique<AMDGPUTargetObjectFile>();
     188             : }
     189             : 
     190        2229 : static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
     191        6687 :   return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
     192             : }
     193             : 
     194           1 : static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
     195           1 :   return new SIScheduleDAGMI(C);
     196             : }
     197             : 
     198             : static ScheduleDAGInstrs *
     199       17325 : createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
     200             :   ScheduleDAGMILive *DAG =
     201       34650 :     new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
     202       34650 :   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     203       34650 :   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     204       34650 :   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
     205       17325 :   return DAG;
     206             : }
     207             : 
     208             : static ScheduleDAGInstrs *
     209           3 : createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
     210             :   auto DAG = new GCNIterativeScheduler(C,
     211           3 :     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
     212           6 :   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     213           6 :   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     214           3 :   return DAG;
     215             : }
     216             : 
     217           3 : static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
     218             :   return new GCNIterativeScheduler(C,
     219           3 :     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
     220             : }
     221             : 
     222             : static ScheduleDAGInstrs *
     223           2 : createIterativeILPMachineScheduler(MachineSchedContext *C) {
     224             :   auto DAG = new GCNIterativeScheduler(C,
     225           2 :     GCNIterativeScheduler::SCHEDULE_ILP);
     226           4 :   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     227           4 :   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     228           4 :   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
     229           2 :   return DAG;
     230             : }
     231             : 
     232             : static MachineSchedRegistry
     233       99237 : R600SchedRegistry("r600", "Run R600's custom scheduler",
     234             :                    createR600MachineScheduler);
     235             : 
     236             : static MachineSchedRegistry
     237       99237 : SISchedRegistry("si", "Run SI's custom scheduler",
     238             :                 createSIMachineScheduler);
     239             : 
     240             : static MachineSchedRegistry
     241       99237 : GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
     242             :                              "Run GCN scheduler to maximize occupancy",
     243             :                              createGCNMaxOccupancyMachineScheduler);
     244             : 
     245             : static MachineSchedRegistry
     246       99237 : IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
     247             :   "Run GCN scheduler to maximize occupancy (experimental)",
     248             :   createIterativeGCNMaxOccupancyMachineScheduler);
     249             : 
     250             : static MachineSchedRegistry
     251       99237 : GCNMinRegSchedRegistry("gcn-minreg",
     252             :   "Run GCN iterative scheduler for minimal register usage (experimental)",
     253             :   createMinRegScheduler);
     254             : 
     255             : static MachineSchedRegistry
     256       99237 : GCNILPSchedRegistry("gcn-ilp",
     257             :   "Run GCN iterative scheduler for ILP scheduling (experimental)",
     258             :   createIterativeILPMachineScheduler);
     259             : 
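                     : // --- Editor's sketch, not part of the measured file ---
                     : // Each registry entry above makes its factory selectable on the command
                     : // line, e.g. "llc -misched=gcn-ilp". A hypothetical extra entry follows the
                     : // same shape ("example-sched" and createExampleScheduler are invented):
                     : static ScheduleDAGInstrs *createExampleScheduler(MachineSchedContext *C) {
                     :   return createGenericSchedLive(C); // defer to the generic scheduler
                     : }
                     : 
                     : static MachineSchedRegistry
                     : ExampleSchedRegistry("example-sched", "Run a placeholder example scheduler",
                     :                      createExampleScheduler);
                     : 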
     260             : static StringRef computeDataLayout(const Triple &TT) {
     261        2508 :   if (TT.getArch() == Triple::r600) {
     262             :     // 32-bit pointers.
     263             :       return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
     264             :              "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
     265             :   }
     266             : 
     267             :   // 32-bit private, local, and region pointers. 64-bit global, constant and
     268             :   // flat.
     269             :     return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
     270             :          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
     271             :          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
     272             : }
     273             : 
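                     : // --- Editor's sketch, not part of the measured file ---
                     : // The layout strings above can be inspected with llvm::DataLayout. In the
                     : // amdgcn string, p5 (private) pointers are 32-bit, the default address
                     : // space is 64-bit, and "-A5" puts allocas in address space 5:
                     : static void exampleInspectLayout() {
                     :   DataLayout DL("e-p:64:64-p5:32:32-A5");   // abbreviated for illustration
                     :   unsigned DefaultBits = DL.getPointerSizeInBits(0); // 64
                     :   unsigned PrivateBits = DL.getPointerSizeInBits(5); // 32
                     :   unsigned AllocaAS = DL.getAllocaAddrSpace();       // 5
                     :   (void)DefaultBits; (void)PrivateBits; (void)AllocaAS;
                     : }
                     : 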
     274             : LLVM_READNONE
     275             : static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
     276        2508 :   if (!GPU.empty())
     277             :     return GPU;
     278             : 
     279         702 :   if (TT.getArch() == Triple::amdgcn)
     280             :     return "generic";
     281             : 
     282             :   return "r600";
     283             : }
     284             : 
     285             : static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
     286             :   // The AMDGPU toolchain only supports generating shared objects, so we
     287             :   // must always use PIC.
     288             :   return Reloc::PIC_;
     289             : }
     290             : 
     291             : static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
     292        2508 :   if (CM)
     293             :     return *CM;
     294             :   return CodeModel::Small;
     295             : }
     296             : 
     297        2508 : AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
     298             :                                          StringRef CPU, StringRef FS,
     299             :                                          TargetOptions Options,
     300             :                                          Optional<Reloc::Model> RM,
     301             :                                          Optional<CodeModel::Model> CM,
     302        2508 :                                          CodeGenOpt::Level OptLevel)
     303             :     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
     304             :                         FS, Options, getEffectiveRelocModel(RM),
     305             :                         getEffectiveCodeModel(CM), OptLevel),
     306        7524 :       TLOF(createTLOF(getTargetTriple())) {
     307        5016 :   AS = AMDGPU::getAMDGPUAS(TT);
     308        2508 :   initAsmInfo();
     309        2508 : }
     310             : 
     311             : AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
     312             : 
     313             : bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
     314             : 
     315      570007 : StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
     316      570007 :   Attribute GPUAttr = F.getFnAttribute("target-cpu");
     317      570007 :   return GPUAttr.hasAttribute(Attribute::None) ?
     318     1024445 :     getTargetCPU() : GPUAttr.getValueAsString();
     319             : }
     320             : 
     321      570007 : StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
     322      570007 :   Attribute FSAttr = F.getFnAttribute("target-features");
     323             : 
     324      570007 :   return FSAttr.hasAttribute(Attribute::None) ?
     325             :     getTargetFeatureString() :
     326      776196 :     FSAttr.getValueAsString();
     327             : }
     328             : 
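                     : // --- Editor's sketch, not part of the measured file ---
                     : // The two accessors above read per-function string attributes that a
                     : // frontend attaches to IR functions; the CPU and feature names here are
                     : // illustrative:
                     : static void exampleTagFunction(Function &F) {
                     :   F.addFnAttr("target-cpu", "gfx900");     // read back by getGPUName()
                     :   F.addFnAttr("target-features", "+fp64"); // read back by getFeatureString()
                     : }
                     : 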
     329         118 : static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
     330       13121 :   return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
     331       13003 :       if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
     332       13003 :         AAR.addAAResult(WrapperPass->getResult());
     333       13239 :       });
     334             : }
     335             : 
     336             : /// Predicate for Internalize pass.
     337          14 : static bool mustPreserveGV(const GlobalValue &GV) {
     338             :   if (const Function *F = dyn_cast<Function>(&GV))
     339          20 :     return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
     340             : 
     341           4 :   return !GV.use_empty();
     342             : }
     343             : 
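                     : // --- Editor's sketch, not part of the measured file ---
                     : // "Entry function" above means a kernel or shader calling convention; a
                     : // check equivalent in spirit, covering only the compute-kernel case:
                     : static bool exampleIsComputeKernel(const Function &F) {
                     :   return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
                     : }
                     : 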
     344         106 : void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
     345         106 :   Builder.DivergentTarget = true;
     346             : 
     347         106 :   bool EnableOpt = getOptLevel() > CodeGenOpt::None;
     348             :   bool Internalize = InternalizeSymbols;
     349         107 :   bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
     350         106 :   bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
     351         106 :   bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
     352             : 
     353         106 :   if (EnableAMDGPUFunctionCalls) {
     354           2 :     delete Builder.Inliner;
     355           2 :     Builder.Inliner = createAMDGPUFunctionInliningPass();
     356             :   }
     357             : 
     358         106 :   if (Internalize) {
     359             :     // If we're generating code, we always have the whole program available. The
     360             :     // relocations expected for externally visible functions aren't supported,
     361             :     // so make sure every non-entry function is hidden.
     362           4 :     Builder.addExtension(
     363             :       PassManagerBuilder::EP_EnabledOnOptLevel0,
     364           1 :       [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
     365           3 :         PM.add(createInternalizePass(mustPreserveGV));
     366           1 :       });
     367             :   }
     368             : 
     369         318 :   Builder.addExtension(
     370             :     PassManagerBuilder::EP_ModuleOptimizerEarly,
     371             :     [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
     372         177 :                                          legacy::PassManagerBase &PM) {
     373          59 :       if (AMDGPUAA) {
     374          59 :         PM.add(createAMDGPUAAWrapperPass());
     375          59 :         PM.add(createAMDGPUExternalAAWrapperPass());
     376             :       }
     377          59 :       PM.add(createAMDGPUUnifyMetadataPass());
     378          59 :       if (Internalize) {
     379           3 :         PM.add(createInternalizePass(mustPreserveGV));
     380           1 :         PM.add(createGlobalDCEPass());
     381             :       }
     382          59 :       if (EarlyInline)
     383           1 :         PM.add(createAMDGPUAlwaysInlinePass(false));
     384          59 :   });
     385             : 
     386         106 :   const auto &Opt = Options;
     387         318 :   Builder.addExtension(
     388             :     PassManagerBuilder::EP_EarlyAsPossible,
     389             :     [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
     390         271 :                                       legacy::PassManagerBase &PM) {
     391         106 :       if (AMDGPUAA) {
     392          59 :         PM.add(createAMDGPUAAWrapperPass());
     393          59 :         PM.add(createAMDGPUExternalAAWrapperPass());
     394             :       }
     395         106 :       PM.add(llvm::createAMDGPUUseNativeCallsPass());
     396         106 :       if (LibCallSimplify)
     397         118 :         PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
     398         106 :   });
     399             : 
     400         212 :   Builder.addExtension(
     401             :     PassManagerBuilder::EP_CGSCCOptimizerLate,
     402             :     [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
     403             :       // Add infer address spaces pass to the opt pipeline after inlining
     404             :       // but before SROA to increase SROA opportunities.
     405          59 :       PM.add(createInferAddressSpacesPass());
     406             :   });
     407         106 : }
     408             : 
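                     : // --- Editor's sketch, not part of the measured file ---
                     : // Hypothetical driver code: a frontend lets the target install the
                     : // extensions above by calling adjustPassManager before populating the
                     : // legacy pass managers.
                     : static void examplePopulatePasses(AMDGPUTargetMachine &TM) {
                     :   PassManagerBuilder PMB;
                     :   PMB.OptLevel = 2;                    // build an -O2-style pipeline
                     :   TM.adjustPassManager(PMB);           // installs the addExtension hooks
                     :   legacy::PassManager MPM;
                     :   PMB.populateModulePassManager(MPM);  // extensions fire at their points
                     : }
                     : 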
     409             : //===----------------------------------------------------------------------===//
     410             : // R600 Target Machine (R600 -> Cayman)
     411             : //===----------------------------------------------------------------------===//
     412             : 
     413         288 : R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
     414             :                                      StringRef CPU, StringRef FS,
     415             :                                      TargetOptions Options,
     416             :                                      Optional<Reloc::Model> RM,
     417             :                                      Optional<CodeModel::Model> CM,
     418         288 :                                      CodeGenOpt::Level OL, bool JIT)
     419         864 :     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
     420             :   setRequiresStructuredCFG(true);
     421         288 : }
     422             : 
     423       53095 : const R600Subtarget *R600TargetMachine::getSubtargetImpl(
     424             :   const Function &F) const {
     425       53095 :   StringRef GPU = getGPUName(F);
     426       53095 :   StringRef FS = getFeatureString(F);
     427             : 
     428             :   SmallString<128> SubtargetKey(GPU);
     429             :   SubtargetKey.append(FS);
     430             : 
     431       53095 :   auto &I = SubtargetMap[SubtargetKey];
     432       53095 :   if (!I) {
     433             :     // This needs to be done before we create a new subtarget since any
     434             :     // creation will depend on the TM and the code generation flags on the
     435             :     // function that reside in TargetOptions.
     436         284 :     resetTargetOptions(F);
     437         284 :     I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
     438             :   }
     439             : 
     440       53095 :   return I.get();
     441             : }
     442             : 
     443             : //===----------------------------------------------------------------------===//
     444             : // GCN Target Machine (SI+)
     445             : //===----------------------------------------------------------------------===//
     446             : 
     447        2220 : GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
     448             :                                    StringRef CPU, StringRef FS,
     449             :                                    TargetOptions Options,
     450             :                                    Optional<Reloc::Model> RM,
     451             :                                    Optional<CodeModel::Model> CM,
     452        2220 :                                    CodeGenOpt::Level OL, bool JIT)
     453        8880 :     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
     454             : 
     455      516912 : const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
     456      516912 :   StringRef GPU = getGPUName(F);
     457      516912 :   StringRef FS = getFeatureString(F);
     458             : 
     459             :   SmallString<128> SubtargetKey(GPU);
     460             :   SubtargetKey.append(FS);
     461             : 
     462      516912 :   auto &I = SubtargetMap[SubtargetKey];
     463      516912 :   if (!I) {
     464             :     // This needs to be done before we create a new subtarget since any
     465             :     // creation will depend on the TM and the code generation flags on the
     466             :     // function that reside in TargetOptions.
     467        2205 :     resetTargetOptions(F);
     468        2205 :     I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
     469             :   }
     470             : 
     471             :   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
     472             : 
     473      516912 :   return I.get();
     474             : }
     475             : 
     476             : //===----------------------------------------------------------------------===//
     477             : // AMDGPU Pass Setup
     478             : //===----------------------------------------------------------------------===//
     479             : 
     480             : namespace {
     481             : 
     482        2384 : class AMDGPUPassConfig : public TargetPassConfig {
     483             : public:
     484        2394 :   AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     485        2394 :     : TargetPassConfig(TM, PM) {
     486             :     // Exceptions and StackMaps are not supported, so these passes will never do
     487             :     // anything.
     488        2394 :     disablePass(&StackMapLivenessID);
     489        2394 :     disablePass(&FuncletLayoutID);
     490        2394 :   }
     491             : 
     492             :   AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
     493        4084 :     return getTM<AMDGPUTargetMachine>();
     494             :   }
     495             : 
     496             :   ScheduleDAGInstrs *
     497           0 :   createMachineScheduler(MachineSchedContext *C) const override {
     498           0 :     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
     499           0 :     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     500           0 :     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     501           0 :     return DAG;
     502             :   }
     503             : 
     504             :   void addEarlyCSEOrGVNPass();
     505             :   void addStraightLineScalarOptimizationPasses();
     506             :   void addIRPasses() override;
     507             :   void addCodeGenPrepare() override;
     508             :   bool addPreISel() override;
     509             :   bool addInstSelector() override;
     510             :   bool addGCPasses() override;
     511             : };
     512             : 
     513         558 : class R600PassConfig final : public AMDGPUPassConfig {
     514             : public:
     515             :   R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     516         280 :     : AMDGPUPassConfig(TM, PM) {}
     517             : 
     518        2229 :   ScheduleDAGInstrs *createMachineScheduler(
     519             :     MachineSchedContext *C) const override {
     520        2229 :     return createR600MachineScheduler(C);
     521             :   }
     522             : 
     523             :   bool addPreISel() override;
     524             :   bool addInstSelector() override;
     525             :   void addPreRegAlloc() override;
     526             :   void addPreSched2() override;
     527             :   void addPreEmitPass() override;
     528             : };
     529             : 
     530        4210 : class GCNPassConfig final : public AMDGPUPassConfig {
     531             : public:
     532        2114 :   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
     533        2114 :     : AMDGPUPassConfig(TM, PM) {
     534             :     // It is necessary to know the register usage of the entire call graph.  We
     535             :     // allow calls without EnableAMDGPUFunctionCalls if they are marked
     536             :     // noinline, so this is always required.
     537        2114 :     setRequiresCodeGenSCCOrder(true);
     538        2114 :   }
     539             : 
     540             :   GCNTargetMachine &getGCNTargetMachine() const {
     541             :     return getTM<GCNTargetMachine>();
     542             :   }
     543             : 
     544             :   ScheduleDAGInstrs *
     545             :   createMachineScheduler(MachineSchedContext *C) const override;
     546             : 
     547             :   bool addPreISel() override;
     548             :   void addMachineSSAOptimization() override;
     549             :   bool addILPOpts() override;
     550             :   bool addInstSelector() override;
     551             :   bool addIRTranslator() override;
     552             :   bool addLegalizeMachineIR() override;
     553             :   bool addRegBankSelect() override;
     554             :   bool addGlobalInstructionSelect() override;
     555             :   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
     556             :   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
     557             :   void addPreRegAlloc() override;
     558             :   void addPostRegAlloc() override;
     559             :   void addPreSched2() override;
     560             :   void addPreEmitPass() override;
     561             : };
     562             : 
     563             : } // end anonymous namespace
     564             : 
     565             : TargetTransformInfo
     566      414597 : AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) {
     567      414597 :   return TargetTransformInfo(AMDGPUTTIImpl(this, F));
     568             : }
     569             : 
     570        4000 : void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
     571        4000 :   if (getOptLevel() == CodeGenOpt::Aggressive)
     572           0 :     addPass(createGVNPass());
     573             :   else
     574        4000 :     addPass(createEarlyCSEPass());
     575        4000 : }
     576             : 
     577        2000 : void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
     578        2000 :   addPass(createSeparateConstOffsetFromGEPPass());
     579        2000 :   addPass(createSpeculativeExecutionPass());
     580             :   // ReassociateGEPs exposes more opportunites for SLSR. See
     581             :   // the example in reassociate-geps-and-slsr.ll.
     582        2000 :   addPass(createStraightLineStrengthReducePass());
     583             :   // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
     584             :   // EarlyCSE can reuse.
     585        2000 :   addEarlyCSEOrGVNPass();
     586             :   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
     587        2000 :   addPass(createNaryReassociatePass());
     588             :   // NaryReassociate on GEPs creates redundant common expressions, so run
     589             :   // EarlyCSE after it.
     590        2000 :   addPass(createEarlyCSEPass());
     591        2000 : }
     592             : 
     593        2045 : void AMDGPUPassConfig::addIRPasses() {
     594             :   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
     595             : 
     596             :   // There is no reason to run these.
     597        2045 :   disablePass(&StackMapLivenessID);
     598        2045 :   disablePass(&FuncletLayoutID);
     599        2045 :   disablePass(&PatchableFunctionID);
     600             : 
     601        2045 :   addPass(createAMDGPULowerIntrinsicsPass());
     602             : 
     603        3814 :   if (TM.getTargetTriple().getArch() == Triple::r600 ||
     604             :       !EnableAMDGPUFunctionCalls) {
     605             :     // Function calls are not supported, so make sure we inline everything.
     606        2045 :     addPass(createAMDGPUAlwaysInlinePass());
     607        2045 :     addPass(createAlwaysInlinerLegacyPass());
     608             :     // We need to add the barrier noop pass, otherwise adding the function
     609             :     // inlining pass will cause all of the PassConfigs passes to be run
     610             :     // one function at a time, which means if we have a module with two
     611             :     // functions, then we will generate code for the first function
     612             :     // without ever running any passes on the second.
     613        2045 :     addPass(createBarrierNoopPass());
     614             :   }
     615             : 
     616        2045 :   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
     617             :     // TODO: May want to move later or split into an early and late one.
     618             : 
     619        1769 :     addPass(createAMDGPUCodeGenPreparePass());
     620             :   }
     621             : 
     622             :   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
     623        2045 :   if (TM.getTargetTriple().getArch() == Triple::r600)
     624         276 :     addPass(createR600OpenCLImageTypeLoweringPass());
     625             : 
     626             :   // Replace OpenCL enqueued block function pointers with global variables.
     627        2045 :   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
     628             : 
     629        2045 :   if (TM.getOptLevel() > CodeGenOpt::None) {
     630        2000 :     addPass(createInferAddressSpacesPass());
     631        2000 :     addPass(createAMDGPUPromoteAlloca());
     632             : 
     633        2000 :     if (EnableSROA)
     634        1974 :       addPass(createSROAPass());
     635             : 
     636        2000 :     addStraightLineScalarOptimizationPasses();
     637             : 
     638        2000 :     if (EnableAMDGPUAliasAnalysis) {
     639        1987 :       addPass(createAMDGPUAAWrapperPass());
     640        3974 :       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
     641       91084 :                                              AAResults &AAR) {
     642       91084 :         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
     643       91084 :           AAR.addAAResult(WrapperPass->getResult());
     644       91084 :         }));
     645             :     }
     646             :   }
     647             : 
     648        2045 :   TargetPassConfig::addIRPasses();
     649             : 
     650             :   // EarlyCSE is not always strong enough to clean up what LSR produces. For
     651             :   // example, GVN can combine
     652             :   //
     653             :   //   %0 = add %a, %b
     654             :   //   %1 = add %b, %a
     655             :   //
     656             :   // and
     657             :   //
     658             :   //   %0 = shl nsw %a, 2
     659             :   //   %1 = shl %a, 2
     660             :   //
     661             :   // but EarlyCSE can do neither of them.
     662        2045 :   if (getOptLevel() != CodeGenOpt::None)
     663        2000 :     addEarlyCSEOrGVNPass();
     664        2045 : }
     665             : 
     666        2045 : void AMDGPUPassConfig::addCodeGenPrepare() {
     667        2045 :   TargetPassConfig::addCodeGenPrepare();
     668             : 
     669        2045 :   if (EnableLoadStoreVectorizer)
     670        2034 :     addPass(createLoadStoreVectorizerPass());
     671        2045 : }
     672             : 
     673        2045 : bool AMDGPUPassConfig::addPreISel() {
     674        2045 :   addPass(createFlattenCFGPass());
     675        2045 :   return false;
     676             : }
     677             : 
     678        1763 : bool AMDGPUPassConfig::addInstSelector() {
     679        3526 :   addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
     680        1763 :   return false;
     681             : }
     682             : 
     683        2045 : bool AMDGPUPassConfig::addGCPasses() {
     684             :   // Do nothing. GC is not supported.
     685        2045 :   return false;
     686             : }
     687             : 
     688             : //===----------------------------------------------------------------------===//
     689             : // R600 Pass Setup
     690             : //===----------------------------------------------------------------------===//
     691             : 
     692         276 : bool R600PassConfig::addPreISel() {
     693         276 :   AMDGPUPassConfig::addPreISel();
     694             : 
     695         276 :   if (EnableR600StructurizeCFG)
     696         274 :     addPass(createStructurizeCFGPass());
     697         276 :   return false;
     698             : }
     699             : 
     700         276 : bool R600PassConfig::addInstSelector() {
     701         552 :   addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
     702         276 :   return false;
     703             : }
     704             : 
     705         276 : void R600PassConfig::addPreRegAlloc() {
     706         276 :   addPass(createR600VectorRegMerger());
     707         276 : }
     708             : 
     709         276 : void R600PassConfig::addPreSched2() {
     710         276 :   addPass(createR600EmitClauseMarkers(), false);
     711         276 :   if (EnableR600IfConvert)
     712         275 :     addPass(&IfConverterID, false);
     713         276 :   addPass(createR600ClauseMergePass(), false);
     714         276 : }
     715             : 
     716         276 : void R600PassConfig::addPreEmitPass() {
     717         276 :   addPass(createAMDGPUCFGStructurizerPass(), false);
     718         276 :   addPass(createR600ExpandSpecialInstrsPass(), false);
     719         276 :   addPass(&FinalizeMachineBundlesID, false);
     720         276 :   addPass(createR600Packetizer(), false);
     721         276 :   addPass(createR600ControlFlowFinalizer(), false);
     722         276 : }
     723             : 
     724         280 : TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
     725         560 :   return new R600PassConfig(*this, PM);
     726             : }
     727             : 
     728             : //===----------------------------------------------------------------------===//
     729             : // GCN Pass Setup
     730             : //===----------------------------------------------------------------------===//
     731             : 
     732       17325 : ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
     733             :   MachineSchedContext *C) const {
     734       17325 :   const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
     735       17325 :   if (ST.enableSIScheduler())
     736           0 :     return createSIMachineScheduler(C);
     737       17325 :   return createGCNMaxOccupancyMachineScheduler(C);
     738             : }
     739             : 
     740        1769 : bool GCNPassConfig::addPreISel() {
     741        1769 :   AMDGPUPassConfig::addPreISel();
     742             : 
     743             :   // FIXME: We need to run a pass to propagate the attributes when calls are
     744             :   // supported.
     745        1769 :   addPass(createAMDGPUAnnotateKernelFeaturesPass());
     746             : 
     747             :   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
     748             :   // regions formed by them.
     749        1769 :   addPass(&AMDGPUUnifyDivergentExitNodesID);
     750        1769 :   if (!LateCFGStructurize) {
     751        1769 :     addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
     752             :   }
     753        1769 :   addPass(createSinkingPass());
     754        1769 :   addPass(createAMDGPUAnnotateUniformValues());
     755        1769 :   if (!LateCFGStructurize) {
     756        1769 :     addPass(createSIAnnotateControlFlowPass());
     757             :   }
     758             : 
     759        1769 :   return false;
     760             : }
     761             : 
     762        1725 : void GCNPassConfig::addMachineSSAOptimization() {
     763        1725 :   TargetPassConfig::addMachineSSAOptimization();
     764             : 
     765             :   // We want to fold operands after PeepholeOptimizer has run (or as part of
     766             :   // it), because it will eliminate extra copies making it easier to fold the
     767             :   // real source operand. We want to eliminate dead instructions after, so that
     768             :   // we see fewer uses of the copies. We then need to clean up the dead
     769             :   // instructions leftover after the operands are folded as well.
     770             :   //
     771             :   // XXX - Can we get away without running DeadMachineInstructionElim again?
     772        1725 :   addPass(&SIFoldOperandsID);
     773        1725 :   addPass(&DeadMachineInstructionElimID);
     774        1725 :   addPass(&SILoadStoreOptimizerID);
     775        1725 :   if (EnableSDWAPeephole) {
     776        1721 :     addPass(&SIPeepholeSDWAID);
     777        1721 :     addPass(&EarlyMachineLICMID);
     778        1721 :     addPass(&MachineCSEID);
     779        1721 :     addPass(&SIFoldOperandsID);
     780        1721 :     addPass(&DeadMachineInstructionElimID);
     781             :   }
     782        1725 :   addPass(createSIShrinkInstructionsPass());
     783        1725 : }
     784             : 
     785        1725 : bool GCNPassConfig::addILPOpts() {
     786        1725 :   if (EnableEarlyIfConversion)
     787           2 :     addPass(&EarlyIfConverterID);
     788             : 
     789             :   TargetPassConfig::addILPOpts();
     790        1725 :   return false;
     791             : }
     792             : 
     793        1763 : bool GCNPassConfig::addInstSelector() {
     794        1763 :   AMDGPUPassConfig::addInstSelector();
     795        1763 :   addPass(createSILowerI1CopiesPass());
     796        1763 :   addPass(&SIFixSGPRCopiesID);
     797        1763 :   return false;
     798             : }
     799             : 
     800           7 : bool GCNPassConfig::addIRTranslator() {
     801           7 :   addPass(new IRTranslator());
     802           7 :   return false;
     803             : }
     804             : 
     805           7 : bool GCNPassConfig::addLegalizeMachineIR() {
     806           7 :   addPass(new Legalizer());
     807           7 :   return false;
     808             : }
     809             : 
     810           7 : bool GCNPassConfig::addRegBankSelect() {
     811           7 :   addPass(new RegBankSelect());
     812           7 :   return false;
     813             : }
     814             : 
     815           7 : bool GCNPassConfig::addGlobalInstructionSelect() {
     816           7 :   addPass(new InstructionSelect());
     817           7 :   return false;
     818             : }
     819             : 
     820        1769 : void GCNPassConfig::addPreRegAlloc() {
     821        1769 :   if (LateCFGStructurize) {
     822           0 :     addPass(createAMDGPUMachineCFGStructurizerPass());
     823             :   }
     824        1769 :   addPass(createSIWholeQuadModePass());
     825        1769 : }
     826             : 
     827          44 : void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
     828             :   // FIXME: We have to disable the verifier here because of PHIElimination +
     829             :   // TwoAddressInstructions disabling it.
     830             : 
     831             :   // This must be run immediately after phi elimination and before
     832             :   // TwoAddressInstructions, otherwise the processing of the tied operand of
     833             :   // SI_ELSE will introduce a copy of the tied operand source after the else.
     834          88 :   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
     835             : 
     836             :   // This must be run after SILowerControlFlow, since it needs to use the
     837             :   // machine-level CFG, but before register allocation.
     838          88 :   insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
     839             : 
     840          44 :   TargetPassConfig::addFastRegAlloc(RegAllocPass);
     841          44 : }
     842             : 
     843        1725 : void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
     844        3450 :   insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
     845             : 
     846             :   // This must be run immediately after phi elimination and before
     847             :   // TwoAddressInstructions, otherwise the processing of the tied operand of
     848             :   // SI_ELSE will introduce a copy of the tied operand source after the else.
     849        3450 :   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
     850             : 
     851             :   // This must be run after SILowerControlFlow, since it needs to use the
     852             :   // machine-level CFG, but before register allocation.
     853        3450 :   insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
     854             : 
     855        1725 :   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
     856        1725 : }
     857             : 
     858        1769 : void GCNPassConfig::addPostRegAlloc() {
     859        1769 :   addPass(&SIFixVGPRCopiesID);
     860        1769 :   addPass(&SIOptimizeExecMaskingID);
     861             :   TargetPassConfig::addPostRegAlloc();
     862        1769 : }
     863             : 
     864        1769 : void GCNPassConfig::addPreSched2() {
     865        1769 : }
     866             : 
     867        1769 : void GCNPassConfig::addPreEmitPass() {
     868             :   // The hazard recognizer that runs as part of the post-ra scheduler does not
     869             :   // guarantee to be able to handle all hazards correctly. This is because if there
     870             :   // are multiple scheduling regions in a basic block, the regions are scheduled
     871             :   // bottom up, so when we begin to schedule a region we don't know what
     872             :   // instructions were emitted directly before it.
     873             :   //
     874             :   // Here we add a stand-alone hazard recognizer pass which can handle all
     875             :   // cases.
     876        1769 :   addPass(&PostRAHazardRecognizerID);
     877             : 
     878        1769 :   addPass(createSIMemoryLegalizerPass());
     879        1769 :   addPass(createSIInsertWaitcntsPass());
     880        1769 :   addPass(createSIShrinkInstructionsPass());
     881        1769 :   addPass(&SIInsertSkipsPassID);
     882        1769 :   addPass(createSIDebuggerInsertNopsPass());
     883        1769 :   addPass(&BranchRelaxationPassID);
     884        1769 : }
     885             : 
     886        2114 : TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
     887        2114 :   return new GCNPassConfig(*this, PM);
     888      297711 : }

Generated by: LCOV version 1.13