LLVM  7.0.0svn
AMDGPUTargetMachine.cpp
1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief The AMDGPU target machine contains all of the hardware specific
12 /// information needed to emit code for R600 and SI GPUs.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPU.h"
18 #include "AMDGPUAliasAnalysis.h"
19 #include "AMDGPUCallLowering.h"
21 #include "AMDGPULegalizerInfo.h"
22 #include "AMDGPUMacroFusion.h"
23 #include "AMDGPUTargetObjectFile.h"
25 #include "GCNIterativeScheduler.h"
26 #include "GCNSchedStrategy.h"
27 #include "R600MachineScheduler.h"
28 #include "SIMachineScheduler.h"
33 #include "llvm/CodeGen/Passes.h"
36 #include "llvm/IR/Attributes.h"
37 #include "llvm/IR/Function.h"
39 #include "llvm/Pass.h"
41 #include "llvm/Support/Compiler.h"
43 #include "llvm/Transforms/IPO.h"
46 #include "llvm/Transforms/Scalar.h"
49 #include <memory>
50 
51 using namespace llvm;
52 
53 static cl::opt<bool> EnableR600StructurizeCFG(
54   "r600-ir-structurize",
55   cl::desc("Use StructurizeCFG IR pass"),
56   cl::init(true));
57 
58 static cl::opt<bool> EnableSROA(
59   "amdgpu-sroa",
60   cl::desc("Run SROA after promote alloca pass"),
61   cl::ReallyHidden,
62   cl::init(true));
63 
64 static cl::opt<bool>
65 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
66  cl::desc("Run early if-conversion"),
67  cl::init(false));
68 
69 static cl::opt<bool> EnableR600IfConvert(
70   "r600-if-convert",
71   cl::desc("Use if conversion pass"),
72   cl::ReallyHidden,
73   cl::init(true));
74 
75 // Option to disable vectorizer for tests.
76 static cl::opt<bool> EnableLoadStoreVectorizer(
77   "amdgpu-load-store-vectorizer",
78   cl::desc("Enable load store vectorizer"),
79   cl::init(true),
80   cl::Hidden);
81 
82 // Option to control global loads scalarization
83 static cl::opt<bool> ScalarizeGlobal(
84   "amdgpu-scalarize-global-loads",
85   cl::desc("Enable global load scalarization"),
86   cl::init(true),
87   cl::Hidden);
88 
89 // Option to run internalize pass.
90 static cl::opt<bool> InternalizeSymbols(
91   "amdgpu-internalize-symbols",
92   cl::desc("Enable elimination of non-kernel functions and unused globals"),
93   cl::init(false),
94   cl::Hidden);
95 
96 // Option to inline all early.
97 static cl::opt<bool> EarlyInlineAll(
98   "amdgpu-early-inline-all",
99   cl::desc("Inline all functions early"),
100   cl::init(false),
101   cl::Hidden);
102 
103 static cl::opt<bool> EnableSDWAPeephole(
104   "amdgpu-sdwa-peephole",
105   cl::desc("Enable SDWA peepholer"),
106   cl::init(true));
107 
108 // Enable address space based alias analysis
109 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
110  cl::desc("Enable AMDGPU Alias Analysis"),
111  cl::init(true));
112 
113 // Option to enable new waitcnt insertion pass.
114 static cl::opt<bool> EnableSIInsertWaitcntsPass(
115   "enable-si-insert-waitcnts",
116   cl::desc("Use new waitcnt insertion pass"),
117   cl::init(true));
118 
119 // Option to run late CFG structurizer
120 static cl::opt<bool, true> LateCFGStructurize(
121   "amdgpu-late-structurize",
122   cl::desc("Enable late CFG structurization"),
123   cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
124   cl::Hidden);
125 
126 static cl::opt<bool> EnableAMDGPUFunctionCalls(
127   "amdgpu-function-calls",
128   cl::Hidden,
129   cl::desc("Enable AMDGPU function call support"),
130   cl::init(false));
131 
132 // Enable lib calls simplifications
133 static cl::opt<bool> EnableLibCallSimplify(
134   "amdgpu-simplify-libcall",
135   cl::desc("Enable mdgpu library simplifications"),
136   cl::init(true),
137   cl::Hidden);
138 
139 extern "C" void LLVMInitializeAMDGPUTarget() {
140   // Register the target
141   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
142   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
143 
184 }
185 
186 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
187  return llvm::make_unique<AMDGPUTargetObjectFile>();
188 }
189 
190 static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
191   return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
192 }
193 
194 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
195   return new SIScheduleDAGMI(C);
196 }
197 
198 static ScheduleDAGInstrs *
199 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
200   ScheduleDAGMILive *DAG =
201     new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
202   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
203   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
204   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
205   return DAG;
206 }
207 
208 static ScheduleDAGInstrs *
209 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
210   auto DAG = new GCNIterativeScheduler(C,
211     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
212   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
213   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
214   return DAG;
215 }
216 
217 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
218   return new GCNIterativeScheduler(C,
219     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
220 }
221 
222 static ScheduleDAGInstrs *
223 createIterativeILPMachineScheduler(MachineSchedContext *C) {
224   auto DAG = new GCNIterativeScheduler(C,
225     GCNIterativeScheduler::SCHEDULE_ILP);
226   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
227   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
228   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
229   return DAG;
230 }
231 
232 static MachineSchedRegistry
233 R600SchedRegistry("r600", "Run R600's custom scheduler",
234                   createR600MachineScheduler);
236 static MachineSchedRegistry
237 SISchedRegistry("si", "Run SI's custom scheduler",
238                 createSIMachineScheduler);
240 static MachineSchedRegistry
241 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
242   "Run GCN scheduler to maximize occupancy",
243   createGCNMaxOccupancyMachineScheduler);
245 static MachineSchedRegistry
246 IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
247   "Run GCN scheduler to maximize occupancy (experimental)",
248   createIterativeGCNMaxOccupancyMachineScheduler);
250 static MachineSchedRegistry
251 GCNMinRegSchedRegistry("gcn-minreg",
252   "Run GCN iterative scheduler for minimal register usage (experimental)",
253   createMinRegScheduler);
255 static MachineSchedRegistry
256 GCNILPSchedRegistry("gcn-ilp",
257   "Run GCN iterative scheduler for ILP scheduling (experimental)",
258   createIterativeILPMachineScheduler);
260 static StringRef computeDataLayout(const Triple &TT) {
261  if (TT.getArch() == Triple::r600) {
262  // 32-bit pointers.
263  return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
264  "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
265  }
266 
267  // 32-bit private, local, and region pointers. 64-bit global, constant and
268  // flat.
269  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
270  "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
271  "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
272 }
273 
274 LLVM_READNONE
275 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
276  if (!GPU.empty())
277  return GPU;
278 
279  if (TT.getArch() == Triple::amdgcn)
280  return "generic";
281 
282  return "r600";
283 }
284 
285 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
286   // The AMDGPU toolchain only supports generating shared objects, so we
287  // must always use PIC.
288  return Reloc::PIC_;
289 }
290 
291 static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
292   if (CM)
293  return *CM;
294  return CodeModel::Small;
295 }
296 
297 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
298                                          StringRef CPU, StringRef FS,
299                                          TargetOptions Options,
300                                          Optional<Reloc::Model> RM,
301                                          Optional<CodeModel::Model> CM,
302                                          CodeGenOpt::Level OptLevel)
303  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
304  FS, Options, getEffectiveRelocModel(RM),
305  getEffectiveCodeModel(CM), OptLevel),
306  TLOF(createTLOF(getTargetTriple())) {
307  AS = AMDGPU::getAMDGPUAS(TT);
308  initAsmInfo();
309 }
310 
311 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
312 
313 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
314 
315 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
316  Attribute GPUAttr = F.getFnAttribute("target-cpu");
317  return GPUAttr.hasAttribute(Attribute::None) ?
318  getTargetCPU() : GPUAttr.getValueAsString();
319 }
320 
321 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
322   Attribute FSAttr = F.getFnAttribute("target-features");
323 
324   return FSAttr.hasAttribute(Attribute::None) ?
325     getTargetFeatureString() :
326     FSAttr.getValueAsString();
327 }
328 
329 static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
330   return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
331  if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
332  AAR.addAAResult(WrapperPass->getResult());
333  });
334 }
335 
336 /// Predicate for Internalize pass.
337 static bool mustPreserveGV(const GlobalValue &GV) {
338  if (const Function *F = dyn_cast<Function>(&GV))
339  return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
340 
341  return !GV.use_empty();
342 }
343 
344 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
345   Builder.DivergentTarget = true;
346 
347  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
348  bool Internalize = InternalizeSymbols;
349  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
350  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
351  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
352 
353   if (EnableAMDGPUFunctionCalls) {
354     delete Builder.Inliner;
355     Builder.Inliner = createAMDGPUFunctionInliningPass();
356   }
357 
358  if (Internalize) {
359  // If we're generating code, we always have the whole program available. The
360  // relocations expected for externally visible functions aren't supported,
361  // so make sure every non-entry function is hidden.
362   Builder.addExtension(
363     PassManagerBuilder::EP_EnabledOnOptLevel0,
364     [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
365       PM.add(createInternalizePass(mustPreserveGV));
366     });
367  }
368 
369   Builder.addExtension(
370     PassManagerBuilder::EP_ModuleOptimizerEarly,
371     [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
372                                          legacy::PassManagerBase &PM) {
373       if (AMDGPUAA) {
374         PM.add(createAMDGPUAAWrapperPass());
375         PM.add(createAMDGPUExternalAAWrapperPass());
376       }
377       PM.add(createAMDGPUUnifyMetadataPass());
378       if (Internalize) {
379         PM.add(createInternalizePass(mustPreserveGV));
380         PM.add(createGlobalDCEPass());
381       }
382       if (EarlyInline)
383         PM.add(createAMDGPUAlwaysInlinePass(false));
384   });
385 
386   const auto &Opt = Options;
387   Builder.addExtension(
388     PassManagerBuilder::EP_CGSCCOptimizerLate,
389     [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
390                                       legacy::PassManagerBase &PM) {
391       if (AMDGPUAA) {
392         PM.add(createAMDGPUAAWrapperPass());
393         PM.add(createAMDGPUExternalAAWrapperPass());
394       }
395       PM.add(llvm::createAMDGPUUseNativeCallsPass());
396       if (LibCallSimplify)
397         PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
398   });
399 
400   Builder.addExtension(
401     PassManagerBuilder::EP_CGSCCOptimizerLate,
402     [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
403       // Add infer address spaces pass to the opt pipeline after inlining
404       // but before SROA to increase SROA opportunities.
405       PM.add(createInferAddressSpacesPass());
406   });
407 }
408 
409 //===----------------------------------------------------------------------===//
410 // R600 Target Machine (R600 -> Cayman)
411 //===----------------------------------------------------------------------===//
412 
413 R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
414                                      StringRef CPU, StringRef FS,
415                                      TargetOptions Options,
416                                      Optional<Reloc::Model> RM,
417                                      Optional<CodeModel::Model> CM,
418                                      CodeGenOpt::Level OL, bool JIT)
419   : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
420   setRequiresStructuredCFG(true);
421 }
422 
423 const R600Subtarget *R600TargetMachine::getSubtargetImpl(
424   const Function &F) const {
425  StringRef GPU = getGPUName(F);
426  StringRef FS = getFeatureString(F);
427 
428  SmallString<128> SubtargetKey(GPU);
429  SubtargetKey.append(FS);
430 
431  auto &I = SubtargetMap[SubtargetKey];
432  if (!I) {
433  // This needs to be done before we create a new subtarget since any
434  // creation will depend on the TM and the code generation flags on the
435  // function that reside in TargetOptions.
436   resetTargetOptions(F);
437   I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
438  }
439 
440  return I.get();
441 }
442 
443 //===----------------------------------------------------------------------===//
444 // GCN Target Machine (SI+)
445 //===----------------------------------------------------------------------===//
446 
447 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
448                                    StringRef CPU, StringRef FS,
449                                    TargetOptions Options,
450                                    Optional<Reloc::Model> RM,
451                                    Optional<CodeModel::Model> CM,
452                                    CodeGenOpt::Level OL, bool JIT)
453   : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
454 
455 const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
456   StringRef GPU = getGPUName(F);
457  StringRef FS = getFeatureString(F);
458 
459  SmallString<128> SubtargetKey(GPU);
460  SubtargetKey.append(FS);
461 
462  auto &I = SubtargetMap[SubtargetKey];
463  if (!I) {
464  // This needs to be done before we create a new subtarget since any
465  // creation will depend on the TM and the code generation flags on the
466  // function that reside in TargetOptions.
467   resetTargetOptions(F);
468   I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
469  }
470 
471  I->setScalarizeGlobalBehavior(ScalarizeGlobal);
472 
473  return I.get();
474 }
475 
476 //===----------------------------------------------------------------------===//
477 // AMDGPU Pass Setup
478 //===----------------------------------------------------------------------===//
479 
480 namespace {
481 
482 class AMDGPUPassConfig : public TargetPassConfig {
483 public:
484  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
485  : TargetPassConfig(TM, PM) {
486  // Exceptions and StackMaps are not supported, so these passes will never do
487  // anything.
488  disablePass(&StackMapLivenessID);
489  disablePass(&FuncletLayoutID);
490  }
491 
492  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
493  return getTM<AMDGPUTargetMachine>();
494  }
495 
496   ScheduleDAGInstrs *
497   createMachineScheduler(MachineSchedContext *C) const override {
498     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
499     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
500     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
501     return DAG;
502   }
503 
504  void addEarlyCSEOrGVNPass();
505  void addStraightLineScalarOptimizationPasses();
506  void addIRPasses() override;
507  void addCodeGenPrepare() override;
508  bool addPreISel() override;
509  bool addInstSelector() override;
510  bool addGCPasses() override;
511 };
512 
513 class R600PassConfig final : public AMDGPUPassConfig {
514 public:
515  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
516  : AMDGPUPassConfig(TM, PM) {}
517 
518  ScheduleDAGInstrs *createMachineScheduler(
519  MachineSchedContext *C) const override {
520  return createR600MachineScheduler(C);
521  }
522 
523  bool addPreISel() override;
524  bool addInstSelector() override;
525  void addPreRegAlloc() override;
526  void addPreSched2() override;
527  void addPreEmitPass() override;
528 };
529 
530 class GCNPassConfig final : public AMDGPUPassConfig {
531 public:
532  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
533  : AMDGPUPassConfig(TM, PM) {
534  // It is necessary to know the register usage of the entire call graph. We
535  // allow calls without EnableAMDGPUFunctionCalls if they are marked
536  // noinline, so this is always required.
537  setRequiresCodeGenSCCOrder(true);
538  }
539 
540  GCNTargetMachine &getGCNTargetMachine() const {
541  return getTM<GCNTargetMachine>();
542  }
543 
544   ScheduleDAGInstrs *
545   createMachineScheduler(MachineSchedContext *C) const override;
546 
547  bool addPreISel() override;
548  void addMachineSSAOptimization() override;
549  bool addILPOpts() override;
550  bool addInstSelector() override;
551  bool addIRTranslator() override;
552  bool addLegalizeMachineIR() override;
553  bool addRegBankSelect() override;
554  bool addGlobalInstructionSelect() override;
555  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
556  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
557  void addPreRegAlloc() override;
558  void addPostRegAlloc() override;
559  void addPreSched2() override;
560  void addPreEmitPass() override;
561 };
562 
563 } // end anonymous namespace
564 
565 TargetTransformInfo
566 AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) {
567   return TargetTransformInfo(AMDGPUTTIImpl(this, F));
568 }
569 
570 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
571   if (getOptLevel() == CodeGenOpt::Aggressive)
572     addPass(createGVNPass());
573  else
574  addPass(createEarlyCSEPass());
575 }
576 
577 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
578   addPass(createSeparateConstOffsetFromGEPPass());
579   addPass(createSpeculativeExecutionPass());
580   // ReassociateGEPs exposes more opportunites for SLSR. See
581   // the example in reassociate-geps-and-slsr.ll.
582   addPass(createStraightLineStrengthReducePass());
583   // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
584  // EarlyCSE can reuse.
585  addEarlyCSEOrGVNPass();
586  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
587  addPass(createNaryReassociatePass());
588  // NaryReassociate on GEPs creates redundant common expressions, so run
589  // EarlyCSE after it.
590  addPass(createEarlyCSEPass());
591 }
592 
593 void AMDGPUPassConfig::addIRPasses() {
594  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
595 
596  // There is no reason to run these.
597  disablePass(&StackMapLivenessID);
598  disablePass(&FuncletLayoutID);
599  disablePass(&PatchableFunctionID);
600 
601   addPass(createAMDGPULowerIntrinsicsPass());
602 
603   if (TM.getTargetTriple().getArch() == Triple::r600 ||
604       !EnableAMDGPUFunctionCalls) {
605     // Function calls are not supported, so make sure we inline everything.
606     addPass(createAMDGPUAlwaysInlinePass());
607     addPass(createAlwaysInlinerLegacyPass());
608  // We need to add the barrier noop pass, otherwise adding the function
609  // inlining pass will cause all of the PassConfigs passes to be run
610   // one function at a time, which means if we have a module with two
611  // functions, then we will generate code for the first function
612  // without ever running any passes on the second.
613  addPass(createBarrierNoopPass());
614  }
615 
616  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
617  // TODO: May want to move later or split into an early and late one.
618 
619     addPass(createAMDGPUCodeGenPreparePass());
620   }
621 
622  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
623   addPass(createAMDGPUOpenCLImageTypeLoweringPass());
624 
625  // Replace OpenCL enqueued block function pointers with global variables.
626   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
627 
628  if (TM.getOptLevel() > CodeGenOpt::None) {
629  addPass(createInferAddressSpacesPass());
630  addPass(createAMDGPUPromoteAlloca());
631 
632  if (EnableSROA)
633  addPass(createSROAPass());
634 
635  addStraightLineScalarOptimizationPasses();
636 
637     if (EnableAMDGPUAliasAnalysis) {
638       addPass(createAMDGPUAAWrapperPass());
639       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
640                                              AAResults &AAR) {
641  if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
642  AAR.addAAResult(WrapperPass->getResult());
643  }));
644  }
645  }
646 
647   TargetPassConfig::addIRPasses();
648 
649  // EarlyCSE is not always strong enough to clean up what LSR produces. For
650  // example, GVN can combine
651  //
652  // %0 = add %a, %b
653  // %1 = add %b, %a
654  //
655  // and
656  //
657  // %0 = shl nsw %a, 2
658  // %1 = shl %a, 2
659  //
660  // but EarlyCSE can do neither of them.
661  if (getOptLevel() != CodeGenOpt::None)
662  addEarlyCSEOrGVNPass();
663 }
664 
665 void AMDGPUPassConfig::addCodeGenPrepare() {
666   TargetPassConfig::addCodeGenPrepare();
667 
668   if (EnableLoadStoreVectorizer)
669     addPass(createLoadStoreVectorizerPass());
670 }
671 
672 bool AMDGPUPassConfig::addPreISel() {
673  addPass(createFlattenCFGPass());
674  return false;
675 }
676 
677 bool AMDGPUPassConfig::addInstSelector() {
678  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
679  return false;
680 }
681 
682 bool AMDGPUPassConfig::addGCPasses() {
683  // Do nothing. GC is not supported.
684  return false;
685 }
686 
687 //===----------------------------------------------------------------------===//
688 // R600 Pass Setup
689 //===----------------------------------------------------------------------===//
690 
691 bool R600PassConfig::addPreISel() {
692  AMDGPUPassConfig::addPreISel();
693 
694   if (EnableR600StructurizeCFG)
695     addPass(createStructurizeCFGPass());
696  return false;
697 }
698 
699 bool R600PassConfig::addInstSelector() {
700  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
701  return false;
702 }
703 
704 void R600PassConfig::addPreRegAlloc() {
705  addPass(createR600VectorRegMerger());
706 }
707 
708 void R600PassConfig::addPreSched2() {
709  addPass(createR600EmitClauseMarkers(), false);
710   if (EnableR600IfConvert)
711     addPass(&IfConverterID, false);
712  addPass(createR600ClauseMergePass(), false);
713 }
714 
715 void R600PassConfig::addPreEmitPass() {
716  addPass(createAMDGPUCFGStructurizerPass(), false);
717  addPass(createR600ExpandSpecialInstrsPass(), false);
718  addPass(&FinalizeMachineBundlesID, false);
719  addPass(createR600Packetizer(), false);
720  addPass(createR600ControlFlowFinalizer(), false);
721 }
722 
723 TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
724   return new R600PassConfig(*this, PM);
725 }
726 
727 //===----------------------------------------------------------------------===//
728 // GCN Pass Setup
729 //===----------------------------------------------------------------------===//
730 
731 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
732  MachineSchedContext *C) const {
733  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
734  if (ST.enableSIScheduler())
735  return createSIMachineScheduler(C);
736   return createGCNMaxOccupancyMachineScheduler(C);
737 }
738 
739 bool GCNPassConfig::addPreISel() {
740  AMDGPUPassConfig::addPreISel();
741 
742  // FIXME: We need to run a pass to propagate the attributes when calls are
743  // supported.
744   addPass(createAMDGPUAnnotateKernelFeaturesPass());
745 
746  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
747  // regions formed by them.
748   addPass(&AMDGPUUnifyDivergentExitNodesID);
749   if (!LateCFGStructurize) {
750  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
751  }
752   addPass(createSinkingPass());
753   addPass(createAMDGPUAnnotateUniformValues());
754   if (!LateCFGStructurize) {
755     addPass(createSIAnnotateControlFlowPass());
756   }
757 
758  return false;
759 }
760 
761 void GCNPassConfig::addMachineSSAOptimization() {
762   TargetPassConfig::addMachineSSAOptimization();
763 
764  // We want to fold operands after PeepholeOptimizer has run (or as part of
765  // it), because it will eliminate extra copies making it easier to fold the
766  // real source operand. We want to eliminate dead instructions after, so that
767  // we see fewer uses of the copies. We then need to clean up the dead
768  // instructions leftover after the operands are folded as well.
769  //
770  // XXX - Can we get away without running DeadMachineInstructionElim again?
771  addPass(&SIFoldOperandsID);
772   addPass(&DeadMachineInstructionElimID);
773   addPass(&SILoadStoreOptimizerID);
774  if (EnableSDWAPeephole) {
775  addPass(&SIPeepholeSDWAID);
776  addPass(&EarlyMachineLICMID);
777  addPass(&MachineCSEID);
778  addPass(&SIFoldOperandsID);
779     addPass(&DeadMachineInstructionElimID);
780   }
781   addPass(createSIShrinkInstructionsPass());
782 }
783 
784 bool GCNPassConfig::addILPOpts() {
785   if (EnableEarlyIfConversion)
786     addPass(&EarlyIfConverterID);
787 
788   TargetPassConfig::addILPOpts();
789   return false;
790 }
791 
792 bool GCNPassConfig::addInstSelector() {
793  AMDGPUPassConfig::addInstSelector();
794  addPass(createSILowerI1CopiesPass());
795  addPass(&SIFixSGPRCopiesID);
796  return false;
797 }
798 
799 bool GCNPassConfig::addIRTranslator() {
800  addPass(new IRTranslator());
801  return false;
802 }
803 
804 bool GCNPassConfig::addLegalizeMachineIR() {
805  addPass(new Legalizer());
806  return false;
807 }
808 
809 bool GCNPassConfig::addRegBankSelect() {
810  addPass(new RegBankSelect());
811  return false;
812 }
813 
814 bool GCNPassConfig::addGlobalInstructionSelect() {
815  addPass(new InstructionSelect());
816  return false;
817 }
818 
819 void GCNPassConfig::addPreRegAlloc() {
820  if (LateCFGStructurize) {
821     addPass(createAMDGPUMachineCFGStructurizerPass());
822   }
823  addPass(createSIWholeQuadModePass());
824 }
825 
826 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
827  // FIXME: We have to disable the verifier here because of PHIElimination +
828  // TwoAddressInstructions disabling it.
829 
830  // This must be run immediately after phi elimination and before
831  // TwoAddressInstructions, otherwise the processing of the tied operand of
832  // SI_ELSE will introduce a copy of the tied operand source after the else.
833  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
834 
835  // This must be run after SILowerControlFlow, since it needs to use the
836  // machine-level CFG, but before register allocation.
837  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
838 
839  TargetPassConfig::addFastRegAlloc(RegAllocPass);
840 }
841 
842 void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
843   insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
844 
845  // This must be run immediately after phi elimination and before
846  // TwoAddressInstructions, otherwise the processing of the tied operand of
847  // SI_ELSE will introduce a copy of the tied operand source after the else.
848  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
849 
850  // This must be run after SILowerControlFlow, since it needs to use the
851  // machine-level CFG, but before register allocation.
852  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
853 
854   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
855 }
856 
857 void GCNPassConfig::addPostRegAlloc() {
858  addPass(&SIFixVGPRCopiesID);
859  addPass(&SIOptimizeExecMaskingID);
860   TargetPassConfig::addPostRegAlloc();
861 }
862 
863 void GCNPassConfig::addPreSched2() {
864 }
865 
866 void GCNPassConfig::addPreEmitPass() {
867  // The hazard recognizer that runs as part of the post-ra scheduler does not
868   // guarantee to be able to handle all hazards correctly. This is because if there
869  // are multiple scheduling regions in a basic block, the regions are scheduled
870  // bottom up, so when we begin to schedule a region we don't know what
871  // instructions were emitted directly before it.
872  //
873  // Here we add a stand-alone hazard recognizer pass which can handle all
874  // cases.
875  addPass(&PostRAHazardRecognizerID);
876 
877  addPass(createSIMemoryLegalizerPass());
878   if (EnableSIInsertWaitcntsPass)
879     addPass(createSIInsertWaitcntsPass());
880  else
881  addPass(createSIInsertWaitsPass());
882   addPass(createSIShrinkInstructionsPass());
883   addPass(&SIInsertSkipsPassID);
884   addPass(createSIDebuggerInsertNopsPass());
885   addPass(&BranchRelaxationPassID);
886 }
887 
888 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
889   return new GCNPassConfig(*this, PM);
890 }
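The target machines defined above are normally reached through the generic TargetRegistry/TargetMachine interface rather than constructed directly. Below is a minimal sketch of that path, not part of AMDGPUTargetMachine.cpp; the triple string and the helper name createAMDGCNTargetMachine are illustrative only, and it assumes an LLVM build with the AMDGPU backend linked in.

```cpp
// Sketch: constructing a GCNTargetMachine through the TargetRegistry, the same
// entry point llc and clang use. LLVMInitializeAMDGPUTarget() must have run so
// that RegisterTargetMachine<GCNTargetMachine> has registered the target.
#include "llvm/ADT/Optional.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <string>

using namespace llvm;

TargetMachine *createAMDGCNTargetMachine() {   // illustrative helper name
  std::string Error;
  const std::string TripleName = "amdgcn-amd-amdhsa";
  const Target *T = TargetRegistry::lookupTarget(TripleName, Error);
  if (!T)
    return nullptr;                            // backend not linked in
  // An empty CPU string becomes "generic" via getGPUOrDefault(), and the
  // relocation model is forced to PIC by getEffectiveRelocModel() above.
  return T->createTargetMachine(TripleName, /*CPU=*/"", /*Features=*/"",
                                TargetOptions(), /*RM=*/None);
}
```

The MachineSchedRegistry entries in this file likewise make the custom schedulers selectable by name (for example through the -misched option) once the target has been registered.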